/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/tslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;

static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
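
/*
 * Illustrative example (hypothetical addresses, not taken from any real
 * machine): a system with two usable regions, [0xa000, 0x9f000) and
 * [0x100000, 0x1fff0000), would be described by
 *
 *	phys_avail[] = { 0xa000, 0x9f000, 0x100000, 0x1fff0000, 0, 0 };
 *
 * dump_avail[] begins with the same contents; boot-time allocations then
 * trim extents from phys_avail[] only.
 */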

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
static int __read_mostly vm_default_freepool;

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int pool, int tail);

static bool __diagused
vm_phys_pool_valid(int pool)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (pool == VM_FREEPOOL_LAZYINIT)
		return (false);
#endif
	return (pool >= 0 && pool < VM_NFREEPOOL);
}

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

int
vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
    vm_paddr_t high __numa_used)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match: Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int pool,
    int tail)
{

	m->order = order;
	m->pool = pool;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	if (!(0 <= domain && domain < vm_ndomains))
		panic("%s: Invalid domain %d ('vm_ndomains' is %d)",
		    __func__, domain, vm_ndomains);
	if (vm_phys_nsegs >= VM_PHYSSEG_MAX)
		panic("Not enough storage for physical segments, "
		    "increase VM_PHYSSEG_MAX");

	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && seg[-1].start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
	if (seg != vm_phys_segs && seg[-1].end > start)
		panic("Overlapping physical segments: Current [%#jx,%#jx) "
		    "at index %zu, previous [%#jx,%#jx)",
		    (uintmax_t)start, (uintmax_t)end, seg - vm_phys_segs,
		    (uintmax_t)seg[-1].start, (uintmax_t)seg[-1].end);
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	if ((start & PAGE_MASK) != 0)
		panic("%s: start (%jx) is not page aligned", __func__,
		    (uintmax_t)start);
	if ((end & PAGE_MASK) != 0)
		panic("%s: end (%jx) is not page aligned", __func__,
		    (uintmax_t)end);
	if (start > end)
		panic("%s: start (%jx) > end (%jx)!", __func__,
		    (uintmax_t)start, (uintmax_t)end);

	if (start == end)
		return;

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
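	/*
	 * Illustrative example (boundary values are hypothetical): if
	 * VM_LOWMEM_BOUNDARY were 16MB and VM_DMA32_BOUNDARY is 4GB, a
	 * segment [1MB, 5GB) would be registered as three segments:
	 * [1MB, 16MB), [16MB, 4GB), and [4GB, 5GB).
	 */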
	paddr = start;
#ifdef VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

#ifdef VM_FREEPOOL_LAZYINIT
	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
	vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains __numa_used,
    struct mem_affinity *affinity __numa_used, int *locality __numa_used)
{
#ifdef NUMA
	int i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
	if (numa_disabled)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective is to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
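/*
 * Illustrative example (hypothetical request): satisfying an order-2 request
 * from an order-5 block splits off buddies of orders 4, 3, and 2, that is,
 * 32 pages become free blocks of 16, 8, and 4 pages plus the 4 pages kept by
 * the caller.
 */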
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int pool, int tail)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, pool, tail);
	}
}

static void
vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int pool,
    int tail)
{
	KASSERT(order >= 0 && order < VM_NFREEORDER,
	    ("%s: invalid order %d", __func__, order));

	vm_freelist_add(fl, m, order, pool, tail);
#ifdef VM_FREEPOOL_LAZYINIT
	if (__predict_false(pool == VM_FREEPOOL_LAZYINIT)) {
		vm_page_t m_next;
		vm_paddr_t pa;
		int npages;

		npages = 1 << order;
		m_next = m + npages;
		pa = m->phys_addr + ptoa(npages);
		if (pa < vm_phys_segs[m->segind].end) {
			vm_page_init_page(m_next, pa, m->segind,
			    VM_FREEPOOL_LAZYINIT);
		}
	}
#endif
}

/*
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
 * aligned and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    (VM_PAGE_TO_PHYS(m) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("%s: page %p and npages %u are misaligned",
	    __func__, m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("%s: page %p has unexpected order %d",
		    __func__, m, m->order));
		order = ilog2(npages);
		KASSERT(order < VM_NFREEORDER,
		    ("%s: order %d is out of range", __func__, order));
		vm_phys_enq_chunk(fl, m, order, pool, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * If npages is zero, this function does nothing and ignores the physical page
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
 */
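/*
 * Illustrative example (hypothetical count): for npages = 11,
 * vm_phys_enq_beg() above frees chunks of 8, 2, and 1 pages (largest first,
 * from the aligned start), whereas vm_phys_enq_range() below frees chunks of
 * 1, 2, and 8 pages (smallest first, ending at the aligned end) and returns
 * the first page past the range.
 */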
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		vm_phys_enq_chunk(fl, m, order, pool, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
	return (m);
}

/*
 * Complete initialization of a contiguous, power of two-sized set of physical
 * pages.
 *
 * If the pages currently belong to the lazy init pool, then the corresponding
 * page structures must be initialized.  In this case it is assumed that the
 * first page in the run has already been initialized.
 */
static void
vm_phys_finish_init(vm_page_t m, int order)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
		vm_paddr_t pa;
		int segind;

		TSENTER();
		pa = m->phys_addr + PAGE_SIZE;
		segind = m->segind;
		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
		    m_tmp++, pa += PAGE_SIZE)
			vm_page_init_page(m_tmp, pa, segind, VM_NFREEPOOL);
		TSEXIT();
	}
#endif
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().  The allocated pages have no
 * valid pool field set.
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = i + (1 << oind);
				end = imin(npages, avail);
				while (i < end)
					ma[i++] = m++;
				if (i == npages) {
					/*
					 * Return excess pages to fl.  Its order
					 * [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - i, fl,
					    pool, 1);
					return (npages);
				}
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
			    pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_finish_init(m, oind);
					avail = i + (1 << oind);
					end = imin(npages, avail);
					while (i < end)
						ma[i++] = m++;
					if (i == npages) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail - i,
						    fl, pool, 1);
						return (npages);
					}
				}
			}
		}
	}
	return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
static vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, pool, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_finish_init(m, oind);
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, pool, 1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address, which must lie
 * within the given physical memory segment.
 */
vm_page_t
vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
{
	KASSERT(pa >= seg->start && pa < seg->end,
	    ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));

	return (&seg->first_page[atop(pa - seg->start)]);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 * The pool field in the first page determines the destination pool.
 *
 * The free page queues must be locked.
 */
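/*
 * Illustrative example (hypothetical addresses, 4KB pages): freeing the
 * order-0 page at 0x5000 computes its buddy at 0x4000 (XOR with 0x1000).
 * If that buddy is free at order 0, the two merge into an order-1 block at
 * 0x4000, whose order-1 buddy is 0x6000, and so on until a buddy is missing
 * or the maximum order is reached.
 */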
void
vm_phys_free_pages(vm_page_t m, int pool, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("%s: page %p has unexpected order %d",
	    __func__, m, m->order));
	KASSERT(vm_phys_pool_valid(pool),
	    ("%s: unexpected pool param %d", __func__, pool));
	KASSERT(order < VM_NFREEORDER,
	    ("%s: order %d is out of range", __func__, order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			vm_phys_finish_init(m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = vm_phys_seg_paddr_to_vm_page(seg, pa);
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[pool];
	vm_freelist_add(fl, m, order, pool, 1);
}

#ifdef VM_FREEPOOL_LAZYINIT
/*
 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
 * them to the default pool.  This is a prerequisite for some rare operations
 * which need to scan the page array and thus depend on all pages being
 * initialized.
 */
static void
vm_phys_lazy_init_domain(int domain, bool locked)
{
	static bool initdone[MAXMEMDOM];
	struct vm_domain *vmd;
	struct vm_freelist *fl;
	vm_page_t m;
	int pind;
	bool unlocked;

	if (__predict_true(atomic_load_bool(&initdone[domain])))
		return;

	vmd = VM_DOMAIN(domain);
	if (locked)
		vm_domain_free_assert_locked(vmd);
	else
		vm_domain_free_lock(vmd);
	if (atomic_load_bool(&initdone[domain]))
		goto out;
	pind = VM_FREEPOOL_LAZYINIT;
	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
		int flind;

		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pind];
		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
			if (atomic_load_int(&fl[oind].lcnt) == 0)
				continue;
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				/*
				 * Avoid holding the lock across the
				 * initialization unless there's a free page
				 * shortage.
				 */
				vm_freelist_rem(fl, m, oind);
				unlocked = vm_domain_allocate(vmd,
				    VM_ALLOC_NORMAL, 1 << oind);
				if (unlocked)
					vm_domain_free_unlock(vmd);
				vm_phys_finish_init(m, oind);
				if (unlocked) {
					vm_domain_freecnt_inc(vmd, 1 << oind);
					vm_domain_free_lock(vmd);
				}
				vm_phys_free_pages(m, VM_FREEPOOL_DEFAULT,
				    oind);
			}
		}
	}
	atomic_store_bool(&initdone[domain], true);
out:
	if (!locked)
		vm_domain_free_unlock(vmd);
}

static void
vm_phys_lazy_init(void)
{
	for (int domain = 0; domain < vm_ndomains; domain++)
		vm_phys_lazy_init_domain(domain, false);
	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
}

static void
vm_phys_lazy_init_kthr(void *arg __unused)
{
	vm_phys_lazy_init();
	kthread_exit();
}

static void
vm_phys_lazy_sysinit(void *arg __unused)
{
	struct thread *td;
	int error;

	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
	    RFSTOPPED, 0, "vmlazyinit");
	if (error == 0) {
		thread_lock(td);
		sched_prio(td, PRI_MIN_IDLE);
		sched_add(td, SRQ_BORING);
	} else {
		printf("%s: could not create lazy init thread: %d\n",
		    __func__, error);
		vm_phys_lazy_init();
	}
}
SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
    NULL);
#endif /* VM_FREEPOOL_LAZYINIT */

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.  Assumes no pages have a valid pool field.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	vm_paddr_t diff, lo;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[pool];
	m_end = m + npages;
	/* Free blocks of increasing size. */
	lo = atop(VM_PAGE_TO_PHYS(m));
	if (m < m_end &&
	    (diff = lo ^ (lo + npages - 1)) != 0) {
		order = min(ilog2(diff), VM_NFREEORDER - 1);
		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl,
		    pool, 1);
	}

	/* Free blocks of maximum size. */
	order = VM_NFREEORDER - 1;
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_phys_enq_chunk(fl, m, order, pool, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	vm_phys_enq_beg(m, m_end - m, fl, pool, 1);
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 * Assumes that every page but the first has no valid pool field.
 * Uses the pool value in the first page if valid, otherwise default.
 *
 * The free page queues must be locked.
 */
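/*
 * Illustrative example (hypothetical run): freeing 35 pages starting at page
 * frame 12 splits the run into a head block of 4 pages at frame 12 (order 2),
 * a middle of 30 pages passed to vm_phys_enqueue_contig(), and a tail block
 * of 1 page at frame 46 (order 0); the head and tail blocks may coalesce with
 * free neighbors outside the run.
 */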
void
vm_phys_free_contig(vm_page_t m, int pool, u_long npages)
{
	vm_paddr_t lo;
	vm_page_t m_start, m_end;
	unsigned max_order, order_start, order_end;

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	lo = atop(VM_PAGE_TO_PHYS(m));
	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);

	m_start = m;
	order_start = ffsll(lo) - 1;
	if (order_start < max_order)
		m_start += 1 << order_start;
	m_end = m + npages;
	order_end = ffsll(lo + npages) - 1;
	if (order_end < max_order)
		m_end -= 1 << order_end;
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, pool, m_end - m_start);
	if (order_start < max_order)
		vm_phys_free_pages(m, pool, order_start);
	if (order_end < max_order)
		vm_phys_free_pages(m_end, pool, order_end);
}

/*
 * Identify the first address range within segment segind or greater
 * that matches the domain, lies within the low/high range, and has
 * enough pages.  Return -1 if there is none.
 */
int
vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
    u_long npages, vm_paddr_t low, vm_paddr_t high)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_phys_seg *end_seg, *seg;

	KASSERT(npages > 0, ("npages is zero"));
	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
		if (seg->domain != domain)
			continue;
		if (seg->start >= high)
			return (-1);
		pa_start = MAX(low, seg->start);
		pa_end = MIN(high, seg->end);
		if (pa_end - pa_start < ptoa(npages))
			continue;
#ifdef VM_FREEPOOL_LAZYINIT
		/*
		 * The pages on the free lists must be initialized.
		 */
		vm_phys_lazy_init_domain(domain, false);
#endif
		bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
		return (seg - vm_phys_segs);
	}
	return (-1);
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return true.  Otherwise, return
 * false, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
bool
vm_phys_unfree_page(vm_paddr_t pa)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa_half;
	vm_page_t m, m_set, m_tmp;
	int order, pool;

	seg = vm_phys_paddr_to_seg(pa);
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));

#ifdef VM_FREEPOOL_LAZYINIT
	/*
	 * The pages on the free lists must be initialized.
	 */
	vm_phys_lazy_init_domain(seg->domain, true);
#endif

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	m = vm_phys_paddr_to_vm_page(pa);
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
		else
			return (false);
	}
	if (m_set->order < order)
		return (false);
	if (m_set->order == VM_NFREEORDER)
		return (false);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	pool = m_set->pool;
	fl = (*seg->free_queues)[pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		else {
			m_tmp = m_set;
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		}
		vm_freelist_add(fl, m_tmp, order, pool, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (true);
}

/*
 * Find a run of contiguous physical pages, meeting alignment requirements, from
 * a list of max-sized page blocks, where we need at least two consecutive
 * blocks to satisfy the (large) page request.
 */
static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_phys_seg *seg;
	vm_page_t m, m_iter, m_ret;
	vm_paddr_t max_size, size;
	int max_order;

	max_order = VM_NFREEORDER - 1;
	size = npages << PAGE_SHIFT;
	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
	KASSERT(size > max_size, ("size is too small"));

	/*
	 * In order to avoid examining any free max-sized page block more than
	 * twice, identify the ones that are first in a physically-contiguous
	 * sequence of such blocks, and only for those walk the sequence to
	 * check if there are enough free blocks starting at a properly aligned
	 * block.  Thus, no block is checked for free-ness more than twice.
	 */
	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
		/*
		 * Skip m unless it is first in a sequence of free max page
		 * blocks >= low in its segment.
		 */
		seg = &vm_phys_segs[m->segind];
		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
			continue;
		if (VM_PAGE_TO_PHYS(m) >= max_size &&
		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
		    max_order == m[-1 << max_order].order)
			continue;

		/*
		 * Advance m_ret from m to the first of the sequence, if any,
		 * that satisfies alignment conditions and might leave enough
		 * space.
		 */
		m_ret = m;
		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
		    size, alignment, boundary) &&
		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
		    max_order == m_ret[1 << max_order].order)
			m_ret += 1 << max_order;

		/*
		 * Skip m unless some block m_ret in the sequence is properly
		 * aligned, and begins a sequence of enough pages less than
		 * high, and in the same segment.
		 */
		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
			continue;

		/*
		 * Skip m unless the blocks to allocate starting at m_ret are
		 * all free.
		 */
		for (m_iter = m_ret;
		    m_iter < m_ret + npages && max_order == m_iter->order;
		    m_iter += 1 << max_order) {
		}
		if (m_iter < m_ret + npages)
			continue;
		return (m_ret);
	}
	return (NULL);
}

/*
 * Find a run of contiguous physical pages from the specified free list
 * table.
 */
static vm_page_t
vm_phys_find_queues_contig(
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
    u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_page_t m_ret;
	vm_paddr_t pa, pa_end, size;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Compute the queue that is the best fit for npages. */
	order = flsl(npages - 1);
	/* Search for a large enough free block. */
	size = npages << PAGE_SHIFT;
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
			fl = (*queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
				/*
				 * Determine if the address range starting at pa
				 * is within the given range, satisfies the
				 * given alignment, and does not cross the given
				 * boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (low <= pa && pa_end <= high &&
				    vm_addr_ok(pa, size, alignment, boundary))
					return (m_ret);
			}
		}
	}
	if (order < VM_NFREEORDER)
		return (NULL);
	/* Search for a long-enough sequence of max-order blocks. */
	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
		fl = (*queues)[pind];
		m_ret = vm_phys_find_freelist_contig(fl, npages,
		    low, high, alignment, boundary);
		if (m_ret != NULL)
			return (m_ret);
	}
	return (NULL);
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.  Sets the pool
 * field to DEFAULT in the first allocated page.
 */
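/*
 * Hypothetical usage sketch (not part of this file): with domain 0's free
 * queues locked, a caller could request 512 pages below 4GB with 2MB
 * alignment and no boundary restriction:
 *
 *	vm_domain_free_lock(VM_DOMAIN(0));
 *	m = vm_phys_alloc_contig(0, 512, 0, (vm_paddr_t)1 << 32,
 *	    2 * 1024 * 1024, 0);
 *	vm_domain_free_unlock(VM_DOMAIN(0));
 *
 * A boundary of zero places no boundary restriction on the run.
 */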
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_freelist *fl;
	vm_page_t m, m_run;
	struct vm_phys_seg *seg;
	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
	int oind, segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);
	queues = NULL;
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		/*
		 * If a previous segment led to a search using
		 * the same free lists as would this segment, then
		 * we've actually already searched within this
		 * too.  So skip it.
		 */
		if (seg->free_queues == queues)
			continue;
		queues = seg->free_queues;
		m_run = vm_phys_find_queues_contig(queues, npages,
		    low, high, alignment, boundary);
		if (m_run != NULL)
			break;
	}
	if (m_run == NULL)
		return (NULL);

	/* Allocate pages from the page-range found. */
	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
		fl = (*queues)[m->pool];
		oind = m->order;
		vm_freelist_rem(fl, m, oind);
		vm_phys_finish_init(m, oind);
	}
	/* Return excess pages to the free lists. */
	fl = (*queues)[VM_FREEPOOL_DEFAULT];
	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl,
	    VM_FREEPOOL_DEFAULT, 0);

	/* Return page verified to satisfy conditions of request. */
	pa_start = VM_PAGE_TO_PHYS(m_run);
	KASSERT(low <= pa_start,
	    ("memory allocated below minimum requested range"));
	KASSERT(pa_start + ptoa(npages) <= high,
	    ("memory allocated above maximum requested range"));
	seg = &vm_phys_segs[m_run->segind];
	KASSERT(seg->domain == domain,
	    ("memory not allocated from specified domain"));
	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
	    ("memory alignment/boundary constraints not satisfied"));
	return (m_run);
}

/*
 * Return the index of the first unused slot which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
	int i;

	for (i = 0; i < PHYS_AVAIL_COUNT; i += 2)
		if (phys_avail[i] == 0 && phys_avail[i + 1] == 0)
			return (i);
	panic("Improperly terminated phys_avail[]");
}

/*
 * Assert that a phys_avail entry is valid.
 */
static void
vm_phys_avail_check(int i)
{
	if (i % 2 != 0)
		panic("Chunk start index %d is not even.", i);
	if (phys_avail[i] & PAGE_MASK)
		panic("Unaligned phys_avail[%d]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] & PAGE_MASK)
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
		    (intmax_t)phys_avail[i + 1]);
	if (phys_avail[i + 1] < phys_avail[i])
		panic("phys_avail[%d]: start %#jx > end %#jx", i,
		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
			return (i);
	return (-1);
}
#endif

/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}

	return (largest);
}

vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split a chunk in phys_avail[] at the address 'pa'.
 *
 * 'pa' must be within a chunk (slots i and i + 1) or one of its boundaries.
 * Returns zero on an actual split, in which case the two new chunks occupy
 * slots i to i + 3; EJUSTRETURN if 'pa' was one of the boundaries (and no
 * split actually occurred); or ENOSPC if there are not enough slots in
 * phys_avail[] to represent the additional chunk caused by the split.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	if (pa < phys_avail[i] || pa > phys_avail[i + 1])
		panic("%s: Address %#jx not in range at slot %d [%#jx;%#jx].",
		    __func__, (uintmax_t)pa, i,
		    (uintmax_t)phys_avail[i], (uintmax_t)phys_avail[i + 1]);
	if (pa == phys_avail[i] || pa == phys_avail[i + 1])
		return (EJUSTRETURN);
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i + 2);

	return (0);
}

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest physical segment within the desired
	 * numa domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		if (phys_avail[i+1] - alloc_size < mem_start ||
		    phys_avail[i+1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	if (phys_avail[1] == 0)
		panic("phys_avail[] is empty");

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif