/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/tslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;

static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
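
/*
 * Illustrative sketch (not code from this file's callers): the start/end pair
 * layout described above is typically consumed by walking the array two
 * entries at a time until the terminating pair of zeroes, e.g.
 *
 *	vm_paddr_t start, end;
 *	int i;
 *
 *	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 *		start = phys_avail[i];		// even index: region start
 *		end = phys_avail[i + 1];	// odd index: region end
 *		// ... examine [start, end) ...
 *	}
 *
 * The loop bound mirrors vm_phys_avail_count() below; the variable names are
 * hypothetical.
 */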

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
static int __read_mostly vm_default_freepool;

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

static bool __diagused
vm_phys_pool_valid(int pool)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (pool == VM_FREEPOOL_LAZYINIT)
		return (false);
#endif
	return (pool >= 0 && pool < VM_NFREEPOOL);
}

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}
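
/*
 * Illustrative note on intended usage: because the comparator treats a key
 * with end == 0 as a point query, a lookup for the registered segment that
 * contains a single physical address can be sketched as
 *
 *	struct vm_phys_fictitious_seg tmp, *seg;
 *
 *	tmp.start = pa;		// address being looked up
 *	tmp.end = 0;		// marks this key as a point query
 *	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 *
 * vm_phys_fictitious_to_vm_page() below follows exactly this pattern.
 */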

int
vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
    vm_paddr_t high __numa_used)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match: Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}
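
/*
 * Worked example (illustrative only; the concrete boundary values depend on
 * the platform's VM_LOWMEM_BOUNDARY and VM_DMA32_BOUNDARY definitions): on a
 * platform that defines both boundaries, passing the segment [2MB, 5GB) to
 * vm_phys_add_seg() yields up to three segments,
 *
 *	[2MB, VM_LOWMEM_BOUNDARY)
 *	[VM_LOWMEM_BOUNDARY, VM_DMA32_BOUNDARY)
 *	[VM_DMA32_BOUNDARY, 5GB)
 *
 * so that each resulting segment lies entirely within a single free list.
 */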

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

#ifdef VM_FREEPOOL_LAZYINIT
	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
	vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains __numa_used,
    struct mem_affinity *affinity __numa_used, int *locality __numa_used)
{
#ifdef NUMA
	int i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
	if (numa_disabled)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective being to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int tail)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, tail);
	}
}

static void
vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{
	KASSERT(order >= 0 && order < VM_NFREEORDER,
	    ("%s: invalid order %d", __func__, order));

	vm_freelist_add(fl, m, order, tail);
#ifdef VM_FREEPOOL_LAZYINIT
	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
		vm_page_t m_next;
		vm_paddr_t pa;
		int npages;

		npages = 1 << order;
		m_next = m + npages;
		pa = m->phys_addr + ptoa(npages);
		if (pa < vm_phys_segs[m->segind].end) {
			vm_page_init_page(m_next, pa, m->segind,
			    VM_FREEPOOL_LAZYINIT);
		}
	}
#endif
}
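
/*
 * Worked example (illustrative only): if vm_phys_split_pages() is given an
 * order 4 block (16 pages) and order 2 is requested, it returns the upper
 * buddies to the free list in decreasing sizes: first the trailing order 3
 * buddy (pages 8-15), then the trailing order 2 buddy (pages 4-7), leaving
 * the caller with the order 2 block at pages 0-3.  Exactly one buddy is
 * queued per order in [order, oind).
 */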

/*
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
 * aligned and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    (VM_PAGE_TO_PHYS(m) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("%s: page %p and npages %u are misaligned",
	    __func__, m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("%s: page %p has unexpected order %d",
		    __func__, m, m->order));
		order = ilog2(npages);
		KASSERT(order < VM_NFREEORDER,
		    ("%s: order %d is out of range", __func__, order));
		vm_phys_enq_chunk(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * If npages is zero, this function does nothing and ignores the physical page
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
 */
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		vm_phys_enq_chunk(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
	return (m);
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 *
 * If the pages currently belong to the lazy init pool, then the corresponding
 * page structures must be initialized.  In this case it is assumed that the
 * first page in the run has already been initialized.
 */
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
		vm_paddr_t pa;
		int segind;

		m->pool = pool;

		TSENTER();
		pa = m->phys_addr + PAGE_SIZE;
		segind = m->segind;
		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
		    m_tmp++, pa += PAGE_SIZE)
			vm_page_init_page(m_tmp, pa, segind, pool);
		TSEXIT();
	} else
#endif
		for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
			m_tmp->pool = pool;
}
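
/*
 * Usage sketch for the batch allocator below (an illustration, not an excerpt
 * from an actual caller; the variable names are hypothetical).  The function
 * must be invoked with the per-domain free queues locked, roughly as follows:
 *
 *	vm_page_t ma[8];
 *	int got;
 *
 *	vm_domain_free_lock(VM_DOMAIN(domain));
 *	got = vm_phys_alloc_npages(domain, VM_FREEPOOL_DEFAULT, 8, ma);
 *	vm_domain_free_unlock(VM_DOMAIN(domain));
 *	// got may be less than 8, and the pages in ma[] need not be
 *	// physically contiguous.
 */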

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = i + (1 << oind);
				end = imin(npages, avail);
				while (i < end)
					ma[i++] = m++;
				if (i == npages) {
					/*
					 * Return excess pages to fl.  Its order
					 * [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - i, fl, 1);
					return (npages);
				}
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
			    pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_set_pool(pool, m, oind);
					avail = i + (1 << oind);
					end = imin(npages, avail);
					while (i < end)
						ma[i++] = m++;
					if (i == npages) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail - i,
						    fl, 1);
						return (npages);
					}
				}
			}
		}
	}
	return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
static vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_set_pool(pool, m, oind);
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, 1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address, which must lie
 * within the given physical memory segment.
 */
vm_page_t
vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
{
	KASSERT(pa >= seg->start && pa < seg->end,
	    ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));

	return (&seg->first_page[atop(pa - seg->start)]);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}
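
/*
 * Worked note on the buddy arithmetic used below (illustrative; the concrete
 * numbers assume 4KB pages): the buddy of the order-"order" block starting at
 * physical address pa is found by flipping a single address bit,
 *
 *	buddy = pa ^ ((vm_paddr_t)1 << (PAGE_SHIFT + order));
 *
 * For example, with PAGE_SHIFT == 12 and order == 0, the buddy of 0x1000 is
 * 0x0000 and vice versa.  After a merge, the combined block starts at the
 * lower of the two addresses, which is what the masking step
 * "pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1)" computes.
 */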

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(vm_phys_pool_valid(m->pool),
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = vm_phys_seg_paddr_to_vm_page(seg, pa);
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}

#ifdef VM_FREEPOOL_LAZYINIT
/*
 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
 * them to the default pool.  This is a prerequisite for some rare operations
 * which need to scan the page array and thus depend on all pages being
 * initialized.
 */
static void
vm_phys_lazy_init_domain(int domain, bool locked)
{
	static bool initdone[MAXMEMDOM];
	struct vm_domain *vmd;
	struct vm_freelist *fl;
	vm_page_t m;
	int pind;
	bool unlocked;

	if (__predict_true(atomic_load_bool(&initdone[domain])))
		return;

	vmd = VM_DOMAIN(domain);
	if (locked)
		vm_domain_free_assert_locked(vmd);
	else
		vm_domain_free_lock(vmd);
	if (atomic_load_bool(&initdone[domain]))
		goto out;
	pind = VM_FREEPOOL_LAZYINIT;
	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
		int flind;

		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pind];
		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
			if (atomic_load_int(&fl[oind].lcnt) == 0)
				continue;
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				/*
				 * Avoid holding the lock across the
				 * initialization unless there's a free page
				 * shortage.
				 */
				vm_freelist_rem(fl, m, oind);
				unlocked = vm_domain_allocate(vmd,
				    VM_ALLOC_NORMAL, 1 << oind);
				if (unlocked)
					vm_domain_free_unlock(vmd);
				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
				if (unlocked) {
					vm_domain_freecnt_inc(vmd, 1 << oind);
					vm_domain_free_lock(vmd);
				}
				vm_phys_free_pages(m, oind);
			}
		}
	}
	atomic_store_bool(&initdone[domain], true);
out:
	if (!locked)
		vm_domain_free_unlock(vmd);
}

static void
vm_phys_lazy_init(void)
{
	for (int domain = 0; domain < vm_ndomains; domain++)
		vm_phys_lazy_init_domain(domain, false);
	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
}

static void
vm_phys_lazy_init_kthr(void *arg __unused)
{
	vm_phys_lazy_init();
	kthread_exit();
}

static void
vm_phys_lazy_sysinit(void *arg __unused)
{
	struct thread *td;
	int error;

	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
	    RFSTOPPED, 0, "vmlazyinit");
	if (error == 0) {
		thread_lock(td);
		sched_prio(td, PRI_MIN_IDLE);
		sched_add(td, SRQ_BORING);
	} else {
		printf("%s: could not create lazy init thread: %d\n",
		    __func__, error);
		vm_phys_lazy_init();
	}
}
SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
    NULL);
#endif /* VM_FREEPOOL_LAZYINIT */

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	vm_paddr_t diff, lo;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[m->pool];
	m_end = m + npages;
	/* Free blocks of increasing size. */
	lo = atop(VM_PAGE_TO_PHYS(m));
	if (m < m_end &&
	    (diff = lo ^ (lo + npages - 1)) != 0) {
		order = min(ilog2(diff), VM_NFREEORDER - 1);
		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1);
	}

	/* Free blocks of maximum size. */
	order = VM_NFREEORDER - 1;
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_phys_enq_chunk(fl, m, order, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	vm_phys_enq_beg(m, m_end - m, fl, 1);
}
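
/*
 * Worked example (illustrative; assumes VM_NFREEORDER is large enough that no
 * maximum-size block fits within the range): enqueueing 13 pages that start
 * at page frame number 3 produces
 *
 *	an order 0 block at frame 3		(leading, increasing sizes)
 *	an order 2 block at frames 4-7
 *	an order 3 block at frames 8-15		(trailing, diminishing sizes)
 *
 * i.e., the run is carved into aligned power-of-two blocks rather than being
 * freed page by page.
 */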

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	vm_paddr_t lo;
	vm_page_t m_start, m_end;
	unsigned max_order, order_start, order_end;

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	lo = atop(VM_PAGE_TO_PHYS(m));
	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);

	m_start = m;
	order_start = ffsll(lo) - 1;
	if (order_start < max_order)
		m_start += 1 << order_start;
	m_end = m + npages;
	order_end = ffsll(lo + npages) - 1;
	if (order_end < max_order)
		m_end -= 1 << order_end;
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, m_end - m_start);
	if (order_start < max_order)
		vm_phys_free_pages(m, order_start);
	if (order_end < max_order)
		vm_phys_free_pages(m_end, order_end);
}

/*
 * Identify the first address range within segment segind or greater
 * that matches the domain, lies within the low/high range, and has
 * enough pages.  Return -1 if there is none.
 */
int
vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
    u_long npages, vm_paddr_t low, vm_paddr_t high)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_phys_seg *end_seg, *seg;

	KASSERT(npages > 0, ("npages is zero"));
	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
		if (seg->domain != domain)
			continue;
		if (seg->start >= high)
			return (-1);
		pa_start = MAX(low, seg->start);
		pa_end = MIN(high, seg->end);
		if (pa_end - pa_start < ptoa(npages))
			continue;
#ifdef VM_FREEPOOL_LAZYINIT
		/*
		 * The pages on the free lists must be initialized.
		 */
		vm_phys_lazy_init_domain(domain, false);
#endif
		bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
		return (seg - vm_phys_segs);
	}
	return (-1);
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return true.  Otherwise, return
 * false, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
bool
vm_phys_unfree_page(vm_paddr_t pa)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa_half;
	vm_page_t m, m_set, m_tmp;
	int order;

	seg = vm_phys_paddr_to_seg(pa);
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));

	/*
	 * The pages on the free lists must be initialized.
	 */
#ifdef VM_FREEPOOL_LAZYINIT
	vm_phys_lazy_init_domain(seg->domain, true);
#endif

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	m = vm_phys_paddr_to_vm_page(pa);
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
		else
			return (false);
	}
	if (m_set->order < order)
		return (false);
	if (m_set->order == VM_NFREEORDER)
		return (false);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		else {
			m_tmp = m_set;
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (true);
}

/*
 * Find a run of contiguous physical pages, meeting alignment requirements, from
 * a list of max-sized page blocks, where we need at least two consecutive
 * blocks to satisfy the (large) page request.
 */
static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_phys_seg *seg;
	vm_page_t m, m_iter, m_ret;
	vm_paddr_t max_size, size;
	int max_order;

	max_order = VM_NFREEORDER - 1;
	size = npages << PAGE_SHIFT;
	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
	KASSERT(size > max_size, ("size is too small"));

	/*
	 * In order to avoid examining any free max-sized page block more than
	 * twice, identify the ones that are first in a physically-contiguous
	 * sequence of such blocks, and only for those walk the sequence to
	 * check if there are enough free blocks starting at a properly aligned
	 * block.  Thus, no block is checked for free-ness more than twice.
	 */
	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
		/*
		 * Skip m unless it is first in a sequence of free max page
		 * blocks >= low in its segment.
		 */
		seg = &vm_phys_segs[m->segind];
		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
			continue;
		if (VM_PAGE_TO_PHYS(m) >= max_size &&
		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
		    max_order == m[-1 << max_order].order)
			continue;

		/*
		 * Advance m_ret from m to the first of the sequence, if any,
		 * that satisfies alignment conditions and might leave enough
		 * space.
		 */
		m_ret = m;
		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
		    size, alignment, boundary) &&
		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
		    max_order == m_ret[1 << max_order].order)
			m_ret += 1 << max_order;

		/*
		 * Skip m unless some block m_ret in the sequence is properly
		 * aligned, and begins a sequence of enough pages less than
		 * high, and in the same segment.
		 */
		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
			continue;

		/*
		 * Skip m unless the blocks to allocate starting at m_ret are
		 * all free.
		 */
		for (m_iter = m_ret;
		    m_iter < m_ret + npages && max_order == m_iter->order;
		    m_iter += 1 << max_order) {
		}
		if (m_iter < m_ret + npages)
			continue;
		return (m_ret);
	}
	return (NULL);
}

/*
 * Find a run of contiguous physical pages from the specified free list
 * table.
 */
static vm_page_t
vm_phys_find_queues_contig(
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
    u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_page_t m_ret;
	vm_paddr_t pa, pa_end, size;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Compute the queue that is the best fit for npages. */
	order = flsl(npages - 1);
	/* Search for a large enough free block. */
	size = npages << PAGE_SHIFT;
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
			fl = (*queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
				/*
				 * Determine if the address range starting at pa
				 * is within the given range, satisfies the
				 * given alignment, and does not cross the given
				 * boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (low <= pa && pa_end <= high &&
				    vm_addr_ok(pa, size, alignment, boundary))
					return (m_ret);
			}
		}
	}
	if (order < VM_NFREEORDER)
		return (NULL);
	/* Search for a long-enough sequence of max-order blocks. */
	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
		fl = (*queues)[pind];
		m_ret = vm_phys_find_freelist_contig(fl, npages,
		    low, high, alignment, boundary);
		if (m_ret != NULL)
			return (m_ret);
	}
	return (NULL);
}
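
/*
 * Usage sketch for the allocator below (an illustration, not an excerpt from
 * a real caller; the constraint values are hypothetical): a caller that needs
 * 512 pages that are 2MB-aligned, located below 4GB, and that do not cross a
 * 1GB boundary would, with the domain's free queues locked, invoke
 *
 *	m = vm_phys_alloc_contig(domain, 512, 0, (vm_paddr_t)1 << 32,
 *	    2 * 1024 * 1024, (vm_paddr_t)1 << 30);
 *
 * and receive either the first vm_page of a run satisfying all four
 * constraints or NULL.
 */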

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_freelist *fl;
	vm_page_t m, m_run;
	struct vm_phys_seg *seg;
	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
	int oind, segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);
	queues = NULL;
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		/*
		 * If a previous segment led to a search using
		 * the same free lists as would this segment, then
		 * we've actually already searched within this
		 * too.  So skip it.
		 */
		if (seg->free_queues == queues)
			continue;
		queues = seg->free_queues;
		m_run = vm_phys_find_queues_contig(queues, npages,
		    low, high, alignment, boundary);
		if (m_run != NULL)
			break;
	}
	if (m_run == NULL)
		return (NULL);

	/* Allocate pages from the page-range found. */
	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
		fl = (*queues)[m->pool];
		oind = m->order;
		vm_freelist_rem(fl, m, oind);
		if (m->pool != VM_FREEPOOL_DEFAULT)
			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
	}
	/* Return excess pages to the free lists. */
	fl = (*queues)[VM_FREEPOOL_DEFAULT];
	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0);

	/* Return page verified to satisfy conditions of request. */
	pa_start = VM_PAGE_TO_PHYS(m_run);
	KASSERT(low <= pa_start,
	    ("memory allocated below minimum requested range"));
	KASSERT(pa_start + ptoa(npages) <= high,
	    ("memory allocated above maximum requested range"));
	seg = &vm_phys_segs[m_run->segind];
	KASSERT(seg->domain == domain,
	    ("memory not allocated from specified domain"));
	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
	    ("memory alignment/boundary constraints not satisfied"));
	return (m_run);
}

/*
 * Return the index of the first unused slot which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		continue;
	if (i > PHYS_AVAIL_ENTRIES)
		panic("Improperly terminated phys_avail %d entries", i);

	return (i);
}

/*
 * Assert that a phys_avail entry is valid.
 */
static void
vm_phys_avail_check(int i)
{
	if (phys_avail[i] & PAGE_MASK)
		panic("Unaligned phys_avail[%d]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] & PAGE_MASK)
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] < phys_avail[i])
		panic("phys_avail[%d] start %#jx < end %#jx", i,
		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
			return (i);
	return (-1);
}
#endif

/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}

	return (largest);
}

vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split an entry at the address 'pa'.  Return zero on success or errno.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
		panic("vm_phys_avail_split: invalid address");
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i + 2);

	return (0);
}

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest physical segment within the desired
	 * numa domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		if (phys_avail[i + 1] - alloc_size < mem_start ||
		    phys_avail[i + 1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}
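
/*
 * Note on the DDB command defined below: it is invoked from the kernel
 * debugger prompt, e.g. "db> show freepages", and prints one table per domain
 * and free list in roughly the same layout as the vm.phys_free sysctl above.
 */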

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif