/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/tslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;

static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
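 *
 * For example (hypothetical configuration): if VM_FREELIST_DEFAULT and
 * VM_FREELIST_DMA32 are the only free lists created, then, because
 * VM_FREELIST_DEFAULT is the lowest-numbered constant, they are assigned
 * the free list indices 0 and 1, respectively.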
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
static int __read_mostly vm_default_freepool;

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

static bool __diagused
vm_phys_pool_valid(int pool)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (pool == VM_FREEPOOL_LAZYINIT)
		return (false);
#endif
	return (pool >= 0 && pool < VM_NFREEPOOL);
}

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

int
vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
    vm_paddr_t high __numa_used)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
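	 * An affinity range [start, end] overlaps [low, high] exactly when
	 * start <= high and end >= low, which is the test applied below.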
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match: Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
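 *
 * Each row i lists vm_phys_mem_affinity(i, j) for every domain j.  On a
 * hypothetical two-domain system the output might look like:
 *
 *	0: 10 21
 *	1: 21 10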
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
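	 * For example, two adjacent segments [A, B) and [B, C) that landed in
	 * the same domain and on the same free list are merged into a single
	 * segment [A, C).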
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

#ifdef VM_FREEPOOL_LAZYINIT
	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
	vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains __numa_used,
    struct mem_affinity *affinity __numa_used, int *locality __numa_used)
{
#ifdef NUMA
	int i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
	if (numa_disabled)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective being to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int tail)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, tail);
	}
}

static void
vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{
	KASSERT(order >= 0 && order < VM_NFREEORDER,
	    ("%s: invalid order %d", __func__, order));

	vm_freelist_add(fl, m, order, tail);
#ifdef VM_FREEPOOL_LAZYINIT
	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
		vm_page_t m_next;
		int npages;

		npages = 1 << order;
		m_next = m + npages;
		vm_page_init_page(m_next, m->phys_addr + ptoa(npages),
		    m->segind, VM_FREEPOOL_LAZYINIT);
	}
#endif
}

/*
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
 * aligned and sized set to the specified free list.
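 * (For example, with a hypothetical npages of 11, the chunks enqueued are of
 * order 3, 1, and 0, in that order, largest first.)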
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    (VM_PAGE_TO_PHYS(m) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("%s: page %p and npages %u are misaligned",
	    __func__, m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("%s: page %p has unexpected order %d",
		    __func__, m, m->order));
		order = ilog2(npages);
		KASSERT(order < VM_NFREEORDER,
		    ("%s: order %d is out of range", __func__, order));
		vm_phys_enq_chunk(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * If npages is zero, this function does nothing and ignores the physical page
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
 */
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		vm_phys_enq_chunk(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
	return (m);
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 *
 * If the pages currently belong to the lazy init pool, then the corresponding
 * page structures must be initialized.  In this case it is assumed that the
 * first page in the run has already been initialized.
 */
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
#ifdef VM_FREEPOOL_LAZYINIT
	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
		vm_paddr_t pa;
		int segind;

		m->pool = pool;

		TSENTER();
		pa = m->phys_addr + PAGE_SIZE;
		segind = m->segind;
		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
		    m_tmp++, pa += PAGE_SIZE)
			vm_page_init_page(m_tmp, pa, segind, pool);
		TSEXIT();
	} else
#endif
		for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
			m_tmp->pool = pool;
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = i + (1 << oind);
				end = imin(npages, avail);
				while (i < end)
					ma[i++] = m++;
				if (i == npages) {
					/*
					 * Return excess pages to fl.  Its order
					 * [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - i, fl, 1);
					return (npages);
				}
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
			    pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_set_pool(pool, m, oind);
					avail = i + (1 << oind);
					end = imin(npages, avail);
					while (i < end)
						ma[i++] = m++;
					if (i == npages) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail - i,
						    fl, 1);
						return (npages);
					}
				}
			}
		}
	}
	return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
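 *
 * A minimal calling sketch (assuming the caller manages the domain free lock
 * itself and uses the default pool):
 *
 *	vm_domain_free_lock(VM_DOMAIN(domain));
 *	m = vm_phys_alloc_freelist_pages(domain, VM_FREELIST_DEFAULT,
 *	    VM_FREEPOOL_DEFAULT, order);
 *	vm_domain_free_unlock(VM_DOMAIN(domain));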
 */
vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(vm_phys_pool_valid(pool),
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_set_pool(pool, m, oind);
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, 1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address, which must lie
 * within the given physical memory segment.
 */
vm_page_t
vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
{
	KASSERT(pa >= seg->start && pa < seg->end,
	    ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));

	return (&seg->first_page[atop(pa - seg->start)]);
}

/*
 * Find the vm_page corresponding to the given physical address.
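 * Returns NULL if the address is not covered by any physical memory segment.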
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
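 *
 * (Coalescing illustration, hypothetical addresses, 4 KB pages: freeing the
 * order-0 page at 0x2000 while its buddy at 0x3000 is also free yields an
 * order-1 block at 0x2000; the buddy of a block is found by toggling bit
 * PAGE_SHIFT + order of its physical address, as done below.)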
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(vm_phys_pool_valid(m->pool),
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = vm_phys_seg_paddr_to_vm_page(seg, pa);
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}

#ifdef VM_FREEPOOL_LAZYINIT
/*
 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
 * them to the default pool.  This is a prerequisite for some rare operations
 * which need to scan the page array and thus depend on all pages being
 * initialized.
 */
static void
vm_phys_lazy_init_domain(int domain, bool locked)
{
	static bool initdone[MAXMEMDOM];
	struct vm_domain *vmd;
	struct vm_freelist *fl;
	vm_page_t m;
	int pind;
	bool unlocked;

	if (__predict_true(atomic_load_bool(&initdone[domain])))
		return;

	vmd = VM_DOMAIN(domain);
	if (locked)
		vm_domain_free_assert_locked(vmd);
	else
		vm_domain_free_lock(vmd);
	if (atomic_load_bool(&initdone[domain]))
		goto out;
	pind = VM_FREEPOOL_LAZYINIT;
	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
		int flind;

		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pind];
		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
			if (atomic_load_int(&fl[oind].lcnt) == 0)
				continue;
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				/*
				 * Avoid holding the lock across the
				 * initialization unless there's a free page
				 * shortage.
				 */
				vm_freelist_rem(fl, m, oind);
				unlocked = vm_domain_allocate(vmd,
				    VM_ALLOC_NORMAL, 1 << oind);
				if (unlocked)
					vm_domain_free_unlock(vmd);
				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
				if (unlocked) {
					vm_domain_freecnt_inc(vmd, 1 << oind);
					vm_domain_free_lock(vmd);
				}
				vm_phys_free_pages(m, oind);
			}
		}
	}
	atomic_store_bool(&initdone[domain], true);
out:
	if (!locked)
		vm_domain_free_unlock(vmd);
}

static void
vm_phys_lazy_init(void)
{
	for (int domain = 0; domain < vm_ndomains; domain++)
		vm_phys_lazy_init_domain(domain, false);
	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
}

static void
vm_phys_lazy_init_kthr(void *arg __unused)
{
	vm_phys_lazy_init();
	kthread_exit();
}

static void
vm_phys_lazy_sysinit(void *arg __unused)
{
	struct thread *td;
	int error;

	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
	    RFSTOPPED, 0, "vmlazyinit");
	if (error == 0) {
		thread_lock(td);
		sched_prio(td, PRI_MIN_IDLE);
		sched_add(td, SRQ_BORING);
	} else {
		printf("%s: could not create lazy init thread: %d\n",
		    __func__, error);
		vm_phys_lazy_init();
	}
}
SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
    NULL);
#endif /* VM_FREEPOOL_LAZYINIT */

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	vm_paddr_t diff, lo;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[m->pool];
	m_end = m + npages;
	/* Free blocks of increasing size. */
	lo = atop(VM_PAGE_TO_PHYS(m));
	if (m < m_end &&
	    (diff = lo ^ (lo + npages - 1)) != 0) {
		order = min(ilog2(diff), VM_NFREEORDER - 1);
		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1);
	}

	/* Free blocks of maximum size. */
	order = VM_NFREEORDER - 1;
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_phys_enq_chunk(fl, m, order, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	vm_phys_enq_beg(m, m_end - m, fl, 1);
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
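 *
 * (For example, with hypothetical values lo == 3 and npages == 10, the first
 * page is freed as an order-0 block, the middle eight pages are passed to
 * vm_phys_enqueue_contig(), and the last page is freed as an order-0 block.)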
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	vm_paddr_t lo;
	vm_page_t m_start, m_end;
	unsigned max_order, order_start, order_end;

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	lo = atop(VM_PAGE_TO_PHYS(m));
	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);

	m_start = m;
	order_start = ffsll(lo) - 1;
	if (order_start < max_order)
		m_start += 1 << order_start;
	m_end = m + npages;
	order_end = ffsll(lo + npages) - 1;
	if (order_end < max_order)
		m_end -= 1 << order_end;
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, m_end - m_start);
	if (order_start < max_order)
		vm_phys_free_pages(m, order_start);
	if (order_end < max_order)
		vm_phys_free_pages(m_end, order_end);
}

/*
 * Identify the first address range within segment segind or greater
 * that matches the domain, lies within the low/high range, and has
 * enough pages.  Return -1 if there is none.
 */
int
vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
    u_long npages, vm_paddr_t low, vm_paddr_t high)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_phys_seg *end_seg, *seg;

	KASSERT(npages > 0, ("npages is zero"));
	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
		if (seg->domain != domain)
			continue;
		if (seg->start >= high)
			return (-1);
		pa_start = MAX(low, seg->start);
		pa_end = MIN(high, seg->end);
		if (pa_end - pa_start < ptoa(npages))
			continue;
#ifdef VM_FREEPOOL_LAZYINIT
		/*
		 * The pages on the free lists must be initialized.
		 */
		vm_phys_lazy_init_domain(domain, false);
#endif
		bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
		return (seg - vm_phys_segs);
	}
	return (-1);
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return true.  Otherwise, return
 * false, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
bool
vm_phys_unfree_page(vm_paddr_t pa)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa_half;
	vm_page_t m, m_set, m_tmp;
	int order;

	seg = vm_phys_paddr_to_seg(pa);
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));

	/*
	 * The pages on the free lists must be initialized.
	 */
#ifdef VM_FREEPOOL_LAZYINIT
	vm_phys_lazy_init_domain(seg->domain, true);
#endif

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	m = vm_phys_paddr_to_vm_page(pa);
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
		else
			return (false);
	}
	if (m_set->order < order)
		return (false);
	if (m_set->order == VM_NFREEORDER)
		return (false);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		else {
			m_tmp = m_set;
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (true);
}

/*
 * Find a run of contiguous physical pages, meeting alignment requirements, from
 * a list of max-sized page blocks, where we need at least two consecutive
 * blocks to satisfy the (large) page request.
 */
static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_phys_seg *seg;
	vm_page_t m, m_iter, m_ret;
	vm_paddr_t max_size, size;
	int max_order;

	max_order = VM_NFREEORDER - 1;
	size = npages << PAGE_SHIFT;
	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
	KASSERT(size > max_size, ("size is too small"));

	/*
	 * In order to avoid examining any free max-sized page block more than
	 * twice, identify the ones that are first in a physically-contiguous
	 * sequence of such blocks, and only for those walk the sequence to
	 * check if there are enough free blocks starting at a properly aligned
	 * block.  Thus, no block is checked for free-ness more than twice.
	 */
	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
		/*
		 * Skip m unless it is first in a sequence of free max page
		 * blocks >= low in its segment.
		 */
		seg = &vm_phys_segs[m->segind];
		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
			continue;
		if (VM_PAGE_TO_PHYS(m) >= max_size &&
		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
		    max_order == m[-1 << max_order].order)
			continue;

		/*
		 * Advance m_ret from m to the first of the sequence, if any,
		 * that satisfies alignment conditions and might leave enough
		 * space.
		 */
		m_ret = m;
		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
		    size, alignment, boundary) &&
		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
		    max_order == m_ret[1 << max_order].order)
			m_ret += 1 << max_order;

		/*
		 * Skip m unless some block m_ret in the sequence is properly
		 * aligned, and begins a sequence of enough pages less than
		 * high, and in the same segment.
		 */
		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
			continue;

		/*
		 * Skip m unless the blocks to allocate starting at m_ret are
		 * all free.
		 */
		for (m_iter = m_ret;
		    m_iter < m_ret + npages && max_order == m_iter->order;
		    m_iter += 1 << max_order) {
		}
		if (m_iter < m_ret + npages)
			continue;
		return (m_ret);
	}
	return (NULL);
}

/*
 * Find a run of contiguous physical pages from the specified free list
 * table.
 */
static vm_page_t
vm_phys_find_queues_contig(
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
    u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_page_t m_ret;
	vm_paddr_t pa, pa_end, size;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Compute the queue that is the best fit for npages. */
	order = flsl(npages - 1);
	/* Search for a large enough free block. */
	size = npages << PAGE_SHIFT;
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
			fl = (*queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
				/*
				 * Determine if the address range starting at pa
				 * is within the given range, satisfies the
				 * given alignment, and does not cross the given
				 * boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (low <= pa && pa_end <= high &&
				    vm_addr_ok(pa, size, alignment, boundary))
					return (m_ret);
			}
		}
	}
	if (order < VM_NFREEORDER)
		return (NULL);
	/* Search for a long-enough sequence of max-order blocks. */
	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
		fl = (*queues)[pind];
		m_ret = vm_phys_find_freelist_contig(fl, npages,
		    low, high, alignment, boundary);
		if (m_ret != NULL)
			return (m_ret);
	}
	return (NULL);
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
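 *
 * A usage sketch (hypothetical request: 4 MB below 4 GB, 2 MB aligned, no
 * boundary restriction, with the domain free lock held):
 *
 *	m = vm_phys_alloc_contig(domain, atop(4 * 1024 * 1024), 0,
 *	    (vm_paddr_t)1 << 32, 2 * 1024 * 1024, 0);
 *
 * NULL is returned if no suitable run is found.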
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_freelist *fl;
	vm_page_t m, m_run;
	struct vm_phys_seg *seg;
	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
	int oind, segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);
	queues = NULL;
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		/*
		 * If a previous segment led to a search using
		 * the same free lists as would this segment, then
		 * we've actually already searched within this
		 * too.  So skip it.
		 */
		if (seg->free_queues == queues)
			continue;
		queues = seg->free_queues;
		m_run = vm_phys_find_queues_contig(queues, npages,
		    low, high, alignment, boundary);
		if (m_run != NULL)
			break;
	}
	if (m_run == NULL)
		return (NULL);

	/* Allocate pages from the page-range found. */
	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
		fl = (*queues)[m->pool];
		oind = m->order;
		vm_freelist_rem(fl, m, oind);
		if (m->pool != VM_FREEPOOL_DEFAULT)
			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
	}
	/* Return excess pages to the free lists. */
	fl = (*queues)[VM_FREEPOOL_DEFAULT];
	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0);

	/* Return page verified to satisfy conditions of request. */
	pa_start = VM_PAGE_TO_PHYS(m_run);
	KASSERT(low <= pa_start,
	    ("memory allocated below minimum requested range"));
	KASSERT(pa_start + ptoa(npages) <= high,
	    ("memory allocated above maximum requested range"));
	seg = &vm_phys_segs[m_run->segind];
	KASSERT(seg->domain == domain,
	    ("memory not allocated from specified domain"));
	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
	    ("memory alignment/boundary constraints not satisfied"));
	return (m_run);
}

/*
 * Return the index of the first unused slot which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		continue;
	if (i > PHYS_AVAIL_ENTRIES)
		panic("Improperly terminated phys_avail %d entries", i);

	return (i);
}

/*
 * Assert that a phys_avail entry is valid.
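 * Each entry of the pair must be page aligned and the start must not exceed
 * the end.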
 */
static void
vm_phys_avail_check(int i)
{
	if (phys_avail[i] & PAGE_MASK)
		panic("Unaligned phys_avail[%d]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] & PAGE_MASK)
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
		    (intmax_t)phys_avail[i + 1]);
	if (phys_avail[i + 1] < phys_avail[i])
		panic("phys_avail[%d] start %#jx < end %#jx", i,
		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
			return (i);
	return (-1);
}
#endif

/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}

	return (largest);
}

vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split an entry at the address 'pa'.  Return zero on success or errno.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
		panic("vm_phys_avail_split: invalid address");
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i + 2);

	return (0);
}

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest physical segment within the desired
	 * NUMA domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		if (phys_avail[i + 1] - alloc_size < mem_start ||
		    phys_avail[i + 1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
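 * Invoked from the ddb(4) prompt as "show freepages".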
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif