1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2002-2006 Rice University 5 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu> 6 * All rights reserved. 7 * 8 * This software was developed for the FreeBSD Project by Alan L. Cox, 9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Physical memory system implementation 36 * 37 * Any external functions defined by this module are only to be used by the 38 * virtual memory system. 
39 */ 40 41 #include <sys/cdefs.h> 42 #include "opt_ddb.h" 43 #include "opt_vm.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/domainset.h> 48 #include <sys/lock.h> 49 #include <sys/kernel.h> 50 #include <sys/kthread.h> 51 #include <sys/malloc.h> 52 #include <sys/mutex.h> 53 #include <sys/proc.h> 54 #include <sys/queue.h> 55 #include <sys/rwlock.h> 56 #include <sys/sbuf.h> 57 #include <sys/sched.h> 58 #include <sys/sysctl.h> 59 #include <sys/tree.h> 60 #include <sys/tslog.h> 61 #include <sys/unistd.h> 62 #include <sys/vmmeter.h> 63 64 #include <ddb/ddb.h> 65 66 #include <vm/vm.h> 67 #include <vm/vm_extern.h> 68 #include <vm/vm_param.h> 69 #include <vm/vm_kern.h> 70 #include <vm/vm_object.h> 71 #include <vm/vm_page.h> 72 #include <vm/vm_phys.h> 73 #include <vm/vm_pagequeue.h> 74 75 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, 76 "Too many physsegs."); 77 _Static_assert(sizeof(long long) >= sizeof(vm_paddr_t), 78 "vm_paddr_t too big for ffsll, flsll."); 79 80 #ifdef NUMA 81 struct mem_affinity __read_mostly *mem_affinity; 82 int __read_mostly *mem_locality; 83 84 static int numa_disabled; 85 static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 86 "NUMA options"); 87 SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 88 &numa_disabled, 0, "NUMA-awareness in the allocators is disabled"); 89 #endif 90 91 int __read_mostly vm_ndomains = 1; 92 domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1); 93 94 struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX]; 95 int __read_mostly vm_phys_nsegs; 96 static struct vm_phys_seg vm_phys_early_segs[8]; 97 static int vm_phys_early_nsegs; 98 99 struct vm_phys_fictitious_seg; 100 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, 101 struct vm_phys_fictitious_seg *); 102 103 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = 104 RB_INITIALIZER(&vm_phys_fictitious_tree); 105 106 struct vm_phys_fictitious_seg { 107 RB_ENTRY(vm_phys_fictitious_seg) node; 108 /* Memory region data */ 109 vm_paddr_t start; 110 vm_paddr_t end; 111 vm_page_t first_page; 112 }; 113 114 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, 115 vm_phys_fictitious_cmp); 116 117 static struct rwlock_padalign vm_phys_fictitious_reg_lock; 118 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); 119 120 static struct vm_freelist __aligned(CACHE_LINE_SIZE) 121 vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL] 122 [VM_NFREEORDER_MAX]; 123 124 static int __read_mostly vm_nfreelists; 125 126 /* 127 * These "avail lists" are globals used to communicate boot-time physical 128 * memory layout to other parts of the kernel. Each physically contiguous 129 * region of memory is defined by a start address at an even index and an 130 * end address at the following odd index. Each list is terminated by a 131 * pair of zero entries. 132 * 133 * dump_avail tells the dump code what regions to include in a crash dump, and 134 * phys_avail is all of the remaining physical memory that is available for 135 * the vm system. 136 * 137 * Initially dump_avail and phys_avail are identical. Boot time memory 138 * allocations remove extents from phys_avail that may still be included 139 * in dumps. 140 */ 141 vm_paddr_t phys_avail[PHYS_AVAIL_COUNT]; 142 vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; 143 144 /* 145 * Provides the mapping from VM_FREELIST_* to free list indices (flind). 
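 *
 * Worked sketch of how vm_phys_init() fills this table (hypothetical
 * three-list layouts, not tied to any particular platform): the entries
 * start out as 0/1 flags recording which lists will actually be created,
 * are turned into running totals, and finally have one subtracted:
 *
 *        flags:          { 1, 0, 1 }        { 0, 1, 1 }
 *        running totals: { 1, 1, 2 }        { 0, 1, 2 }
 *        minus one:      { 0, 0, 1 }        {-1, 0, 1 }
 *
 * In the left layout the absent middle list shares the flind of the list
 * before it, so requests for it fall through to that list; in the right
 * layout the absent first list ends up as -1, which the allocation
 * routines treat as "no such free list".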
146 */ 147 static int __read_mostly vm_freelist_to_flind[VM_NFREELIST]; 148 static int __read_mostly vm_default_freepool; 149 150 CTASSERT(VM_FREELIST_DEFAULT == 0); 151 152 #ifdef VM_FREELIST_DMA32 153 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) 154 #endif 155 156 /* 157 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about 158 * the ordering of the free list boundaries. 159 */ 160 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) 161 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); 162 #endif 163 164 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); 165 SYSCTL_OID(_vm, OID_AUTO, phys_free, 166 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 167 sysctl_vm_phys_free, "A", 168 "Phys Free Info"); 169 170 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); 171 SYSCTL_OID(_vm, OID_AUTO, phys_segs, 172 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 173 sysctl_vm_phys_segs, "A", 174 "Phys Seg Info"); 175 176 #ifdef NUMA 177 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); 178 SYSCTL_OID(_vm, OID_AUTO, phys_locality, 179 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 180 sysctl_vm_phys_locality, "A", 181 "Phys Locality Info"); 182 #endif 183 184 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, 185 &vm_ndomains, 0, "Number of physical memory domains available."); 186 187 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); 188 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); 189 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, 190 int order, int pool, int tail); 191 192 static bool __diagused 193 vm_phys_pool_valid(int pool) 194 { 195 #ifdef VM_FREEPOOL_LAZYINIT 196 if (pool == VM_FREEPOOL_LAZYINIT) 197 return (false); 198 #endif 199 return (pool >= 0 && pool < VM_NFREEPOOL); 200 } 201 202 /* 203 * Red-black tree helpers for vm fictitious range management. 204 */ 205 static inline int 206 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, 207 struct vm_phys_fictitious_seg *range) 208 { 209 210 KASSERT(range->start != 0 && range->end != 0, 211 ("Invalid range passed on search for vm_fictitious page")); 212 if (p->start >= range->end) 213 return (1); 214 if (p->start < range->start) 215 return (-1); 216 217 return (0); 218 } 219 220 static int 221 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, 222 struct vm_phys_fictitious_seg *p2) 223 { 224 225 /* Check if this is a search for a page */ 226 if (p1->end == 0) 227 return (vm_phys_fictitious_in_range(p1, p2)); 228 229 KASSERT(p2->end != 0, 230 ("Invalid range passed as second parameter to vm fictitious comparison")); 231 232 /* Searching to add a new range */ 233 if (p1->end <= p2->start) 234 return (-1); 235 if (p1->start >= p2->end) 236 return (1); 237 238 panic("Trying to add overlapping vm fictitious ranges:\n" 239 "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, 240 (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); 241 } 242 243 int 244 vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used, 245 vm_paddr_t high __numa_used) 246 { 247 #ifdef NUMA 248 domainset_t mask; 249 int i; 250 251 if (vm_ndomains == 1 || mem_affinity == NULL) 252 return (0); 253 254 DOMAINSET_ZERO(&mask); 255 /* 256 * Check for any memory that overlaps low, high. 
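 *
 * Worked example (hypothetical layout): with domain 0 covering
 * [0, 4 GiB) and domain 1 covering [4 GiB, 8 GiB), a request with
 * low = 3 GiB and high = 5 GiB overlaps both, so both bits are set in
 * the mask; "prefer" wins if it is one of them, otherwise the
 * lowest-numbered matching domain (here 0) is returned.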
257 */ 258 for (i = 0; mem_affinity[i].end != 0; i++) 259 if (mem_affinity[i].start <= high && 260 mem_affinity[i].end >= low) 261 DOMAINSET_SET(mem_affinity[i].domain, &mask); 262 if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask)) 263 return (prefer); 264 if (DOMAINSET_EMPTY(&mask)) 265 panic("vm_phys_domain_match: Impossible constraint"); 266 return (DOMAINSET_FFS(&mask) - 1); 267 #else 268 return (0); 269 #endif 270 } 271 272 /* 273 * Outputs the state of the physical memory allocator, specifically, 274 * the amount of physical memory in each free list. 275 */ 276 static int 277 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) 278 { 279 struct sbuf sbuf; 280 struct vm_freelist *fl; 281 int dom, error, flind, oind, pind; 282 283 error = sysctl_wire_old_buffer(req, 0); 284 if (error != 0) 285 return (error); 286 sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); 287 for (dom = 0; dom < vm_ndomains; dom++) { 288 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); 289 for (flind = 0; flind < vm_nfreelists; flind++) { 290 sbuf_printf(&sbuf, "\nFREE LIST %d:\n" 291 "\n ORDER (SIZE) | NUMBER" 292 "\n ", flind); 293 for (pind = 0; pind < VM_NFREEPOOL; pind++) 294 sbuf_printf(&sbuf, " | POOL %d", pind); 295 sbuf_printf(&sbuf, "\n-- "); 296 for (pind = 0; pind < VM_NFREEPOOL; pind++) 297 sbuf_printf(&sbuf, "-- -- "); 298 sbuf_printf(&sbuf, "--\n"); 299 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 300 sbuf_printf(&sbuf, " %2d (%6dK)", oind, 301 1 << (PAGE_SHIFT - 10 + oind)); 302 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 303 fl = vm_phys_free_queues[dom][flind][pind]; 304 sbuf_printf(&sbuf, " | %6d", 305 fl[oind].lcnt); 306 } 307 sbuf_printf(&sbuf, "\n"); 308 } 309 } 310 } 311 error = sbuf_finish(&sbuf); 312 sbuf_delete(&sbuf); 313 return (error); 314 } 315 316 /* 317 * Outputs the set of physical memory segments. 318 */ 319 static int 320 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) 321 { 322 struct sbuf sbuf; 323 struct vm_phys_seg *seg; 324 int error, segind; 325 326 error = sysctl_wire_old_buffer(req, 0); 327 if (error != 0) 328 return (error); 329 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 330 for (segind = 0; segind < vm_phys_nsegs; segind++) { 331 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); 332 seg = &vm_phys_segs[segind]; 333 sbuf_printf(&sbuf, "start: %#jx\n", 334 (uintmax_t)seg->start); 335 sbuf_printf(&sbuf, "end: %#jx\n", 336 (uintmax_t)seg->end); 337 sbuf_printf(&sbuf, "domain: %d\n", seg->domain); 338 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); 339 } 340 error = sbuf_finish(&sbuf); 341 sbuf_delete(&sbuf); 342 return (error); 343 } 344 345 /* 346 * Return affinity, or -1 if there's no affinity information. 347 */ 348 int 349 vm_phys_mem_affinity(int f __numa_used, int t __numa_used) 350 { 351 352 #ifdef NUMA 353 if (mem_locality == NULL) 354 return (-1); 355 if (f >= vm_ndomains || t >= vm_ndomains) 356 return (-1); 357 return (mem_locality[f * vm_ndomains + t]); 358 #else 359 return (-1); 360 #endif 361 } 362 363 #ifdef NUMA 364 /* 365 * Outputs the VM locality table. 
366 */ 367 static int 368 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) 369 { 370 struct sbuf sbuf; 371 int error, i, j; 372 373 error = sysctl_wire_old_buffer(req, 0); 374 if (error != 0) 375 return (error); 376 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 377 378 sbuf_printf(&sbuf, "\n"); 379 380 for (i = 0; i < vm_ndomains; i++) { 381 sbuf_printf(&sbuf, "%d: ", i); 382 for (j = 0; j < vm_ndomains; j++) { 383 sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); 384 } 385 sbuf_printf(&sbuf, "\n"); 386 } 387 error = sbuf_finish(&sbuf); 388 sbuf_delete(&sbuf); 389 return (error); 390 } 391 #endif 392 393 static void 394 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int pool, 395 int tail) 396 { 397 398 m->order = order; 399 m->pool = pool; 400 if (tail) 401 TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); 402 else 403 TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); 404 fl[order].lcnt++; 405 } 406 407 static void 408 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) 409 { 410 411 TAILQ_REMOVE(&fl[order].pl, m, listq); 412 fl[order].lcnt--; 413 m->order = VM_NFREEORDER; 414 } 415 416 /* 417 * Create a physical memory segment. 418 */ 419 static void 420 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) 421 { 422 struct vm_phys_seg *seg; 423 424 KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, 425 ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); 426 KASSERT(domain >= 0 && domain < vm_ndomains, 427 ("vm_phys_create_seg: invalid domain provided")); 428 seg = &vm_phys_segs[vm_phys_nsegs++]; 429 while (seg > vm_phys_segs && (seg - 1)->start >= end) { 430 *seg = *(seg - 1); 431 seg--; 432 } 433 seg->start = start; 434 seg->end = end; 435 seg->domain = domain; 436 } 437 438 static void 439 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) 440 { 441 #ifdef NUMA 442 int i; 443 444 if (mem_affinity == NULL) { 445 _vm_phys_create_seg(start, end, 0); 446 return; 447 } 448 449 for (i = 0;; i++) { 450 if (mem_affinity[i].end == 0) 451 panic("Reached end of affinity info"); 452 if (mem_affinity[i].end <= start) 453 continue; 454 if (mem_affinity[i].start > start) 455 panic("No affinity info for start %jx", 456 (uintmax_t)start); 457 if (mem_affinity[i].end >= end) { 458 _vm_phys_create_seg(start, end, 459 mem_affinity[i].domain); 460 break; 461 } 462 _vm_phys_create_seg(start, mem_affinity[i].end, 463 mem_affinity[i].domain); 464 start = mem_affinity[i].end; 465 } 466 #else 467 _vm_phys_create_seg(start, end, 0); 468 #endif 469 } 470 471 /* 472 * Add a physical memory segment. 473 */ 474 void 475 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) 476 { 477 vm_paddr_t paddr; 478 479 KASSERT((start & PAGE_MASK) == 0, 480 ("vm_phys_define_seg: start is not page aligned")); 481 KASSERT((end & PAGE_MASK) == 0, 482 ("vm_phys_define_seg: end is not page aligned")); 483 484 /* 485 * Split the physical memory segment if it spans two or more free 486 * list boundaries. 487 */ 488 paddr = start; 489 #ifdef VM_FREELIST_LOWMEM 490 if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { 491 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); 492 paddr = VM_LOWMEM_BOUNDARY; 493 } 494 #endif 495 #ifdef VM_FREELIST_DMA32 496 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { 497 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); 498 paddr = VM_DMA32_BOUNDARY; 499 } 500 #endif 501 vm_phys_create_seg(paddr, end); 502 } 503 504 /* 505 * Initialize the physical memory allocator. 506 * 507 * Requires that vm_page_array is initialized! 
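 *
 * By the time this runs, vm_phys_add_seg() has already split any segment
 * that crossed a free list boundary, so each segment can be tied to
 * exactly one free list.  Illustrative example, assuming a configuration
 * where VM_FREELIST_DMA32 exists with a 4 GiB boundary, the DMA32 list is
 * actually created, and VM_FREELIST_LOWMEM is ignored for brevity:
 *
 *        vm_phys_add_seg(0xc0000000, 0x180000000);    3 GiB .. 6 GiB
 *
 * is recorded as two segments, [3 GiB, 4 GiB) placed on the DMA32 free
 * list and [4 GiB, 6 GiB) placed on the default free list.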
508 */ 509 void 510 vm_phys_init(void) 511 { 512 struct vm_freelist *fl; 513 struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg; 514 #if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE) 515 u_long npages; 516 #endif 517 int dom, flind, freelist, oind, pind, segind; 518 519 /* 520 * Compute the number of free lists, and generate the mapping from the 521 * manifest constants VM_FREELIST_* to the free list indices. 522 * 523 * Initially, the entries of vm_freelist_to_flind[] are set to either 524 * 0 or 1 to indicate which free lists should be created. 525 */ 526 #ifdef VM_DMA32_NPAGES_THRESHOLD 527 npages = 0; 528 #endif 529 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 530 seg = &vm_phys_segs[segind]; 531 #ifdef VM_FREELIST_LOWMEM 532 if (seg->end <= VM_LOWMEM_BOUNDARY) 533 vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; 534 else 535 #endif 536 #ifdef VM_FREELIST_DMA32 537 if ( 538 #ifdef VM_DMA32_NPAGES_THRESHOLD 539 /* 540 * Create the DMA32 free list only if the amount of 541 * physical memory above physical address 4G exceeds the 542 * given threshold. 543 */ 544 npages > VM_DMA32_NPAGES_THRESHOLD && 545 #endif 546 seg->end <= VM_DMA32_BOUNDARY) 547 vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; 548 else 549 #endif 550 { 551 #ifdef VM_DMA32_NPAGES_THRESHOLD 552 npages += atop(seg->end - seg->start); 553 #endif 554 vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; 555 } 556 } 557 /* Change each entry into a running total of the free lists. */ 558 for (freelist = 1; freelist < VM_NFREELIST; freelist++) { 559 vm_freelist_to_flind[freelist] += 560 vm_freelist_to_flind[freelist - 1]; 561 } 562 vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; 563 KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); 564 /* Change each entry into a free list index. */ 565 for (freelist = 0; freelist < VM_NFREELIST; freelist++) 566 vm_freelist_to_flind[freelist]--; 567 568 /* 569 * Initialize the first_page and free_queues fields of each physical 570 * memory segment. 571 */ 572 #ifdef VM_PHYSSEG_SPARSE 573 npages = 0; 574 #endif 575 for (segind = 0; segind < vm_phys_nsegs; segind++) { 576 seg = &vm_phys_segs[segind]; 577 #ifdef VM_PHYSSEG_SPARSE 578 seg->first_page = &vm_page_array[npages]; 579 npages += atop(seg->end - seg->start); 580 #else 581 seg->first_page = PHYS_TO_VM_PAGE(seg->start); 582 #endif 583 #ifdef VM_FREELIST_LOWMEM 584 if (seg->end <= VM_LOWMEM_BOUNDARY) { 585 flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; 586 KASSERT(flind >= 0, 587 ("vm_phys_init: LOWMEM flind < 0")); 588 } else 589 #endif 590 #ifdef VM_FREELIST_DMA32 591 if (seg->end <= VM_DMA32_BOUNDARY) { 592 flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; 593 KASSERT(flind >= 0, 594 ("vm_phys_init: DMA32 flind < 0")); 595 } else 596 #endif 597 { 598 flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; 599 KASSERT(flind >= 0, 600 ("vm_phys_init: DEFAULT flind < 0")); 601 } 602 seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; 603 } 604 605 /* 606 * Coalesce physical memory segments that are contiguous and share the 607 * same per-domain free queues. 
608 */ 609 prev_seg = vm_phys_segs; 610 seg = &vm_phys_segs[1]; 611 end_seg = &vm_phys_segs[vm_phys_nsegs]; 612 while (seg < end_seg) { 613 if (prev_seg->end == seg->start && 614 prev_seg->free_queues == seg->free_queues) { 615 prev_seg->end = seg->end; 616 KASSERT(prev_seg->domain == seg->domain, 617 ("vm_phys_init: free queues cannot span domains")); 618 vm_phys_nsegs--; 619 end_seg--; 620 for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++) 621 *tmp_seg = *(tmp_seg + 1); 622 } else { 623 prev_seg = seg; 624 seg++; 625 } 626 } 627 628 /* 629 * Initialize the free queues. 630 */ 631 for (dom = 0; dom < vm_ndomains; dom++) { 632 for (flind = 0; flind < vm_nfreelists; flind++) { 633 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 634 fl = vm_phys_free_queues[dom][flind][pind]; 635 for (oind = 0; oind < VM_NFREEORDER; oind++) 636 TAILQ_INIT(&fl[oind].pl); 637 } 638 } 639 } 640 641 #ifdef VM_FREEPOOL_LAZYINIT 642 vm_default_freepool = VM_FREEPOOL_LAZYINIT; 643 #else 644 vm_default_freepool = VM_FREEPOOL_DEFAULT; 645 #endif 646 647 rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); 648 } 649 650 /* 651 * Register info about the NUMA topology of the system. 652 * 653 * Invoked by platform-dependent code prior to vm_phys_init(). 654 */ 655 void 656 vm_phys_register_domains(int ndomains __numa_used, 657 struct mem_affinity *affinity __numa_used, int *locality __numa_used) 658 { 659 #ifdef NUMA 660 int i; 661 662 /* 663 * For now the only override value that we support is 1, which 664 * effectively disables NUMA-awareness in the allocators. 665 */ 666 TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled); 667 if (numa_disabled) 668 ndomains = 1; 669 670 if (ndomains > 1) { 671 vm_ndomains = ndomains; 672 mem_affinity = affinity; 673 mem_locality = locality; 674 } 675 676 for (i = 0; i < vm_ndomains; i++) 677 DOMAINSET_SET(i, &all_domains); 678 #endif 679 } 680 681 /* 682 * Split a contiguous, power of two-sized set of physical pages. 683 * 684 * When this function is called by a page allocation function, the caller 685 * should request insertion at the head unless the order [order, oind) queues 686 * are known to be empty. The objective being to reduce the likelihood of 687 * long-term fragmentation by promoting contemporaneous allocation and 688 * (hopefully) deallocation. 689 */ 690 static __inline void 691 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, 692 int pool, int tail) 693 { 694 vm_page_t m_buddy; 695 696 while (oind > order) { 697 oind--; 698 m_buddy = &m[1 << oind]; 699 KASSERT(m_buddy->order == VM_NFREEORDER, 700 ("vm_phys_split_pages: page %p has unexpected order %d", 701 m_buddy, m_buddy->order)); 702 vm_freelist_add(fl, m_buddy, oind, pool, tail); 703 } 704 } 705 706 static void 707 vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int pool, 708 int tail) 709 { 710 KASSERT(order >= 0 && order < VM_NFREEORDER, 711 ("%s: invalid order %d", __func__, order)); 712 713 vm_freelist_add(fl, m, order, pool, tail); 714 #ifdef VM_FREEPOOL_LAZYINIT 715 if (__predict_false(pool == VM_FREEPOOL_LAZYINIT)) { 716 vm_page_t m_next; 717 vm_paddr_t pa; 718 int npages; 719 720 npages = 1 << order; 721 m_next = m + npages; 722 pa = m->phys_addr + ptoa(npages); 723 if (pa < vm_phys_segs[m->segind].end) { 724 vm_page_init_page(m_next, pa, m->segind, 725 VM_FREEPOOL_LAZYINIT); 726 } 727 } 728 #endif 729 } 730 731 /* 732 * Add the physical pages [m, m + npages) at the beginning of a power-of-two 733 * aligned and sized set to the specified free list. 
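 *
 * Illustrative aside: because the containing set is power-of-two aligned
 * and sized, the front of the range is always aligned for the largest
 * chunk that still fits, so the loop peels chunks of order ilog2(npages)
 * off the front.  For a hypothetical npages = 13, the chunks enqueued are
 * 8 + 4 + 1 pages, i.e. orders 3, 2 and 0.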
734 * 735 * When this function is called by a page allocation function, the caller 736 * should request insertion at the head unless the lower-order queues are 737 * known to be empty. The objective being to reduce the likelihood of long- 738 * term fragmentation by promoting contemporaneous allocation and (hopefully) 739 * deallocation. 740 * 741 * The physical page m's buddy must not be free. 742 */ 743 static void 744 vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool, 745 int tail) 746 { 747 int order; 748 749 KASSERT(npages == 0 || 750 (VM_PAGE_TO_PHYS(m) & 751 ((PAGE_SIZE << ilog2(npages)) - 1)) == 0, 752 ("%s: page %p and npages %u are misaligned", 753 __func__, m, npages)); 754 while (npages > 0) { 755 KASSERT(m->order == VM_NFREEORDER, 756 ("%s: page %p has unexpected order %d", 757 __func__, m, m->order)); 758 order = ilog2(npages); 759 KASSERT(order < VM_NFREEORDER, 760 ("%s: order %d is out of range", __func__, order)); 761 vm_phys_enq_chunk(fl, m, order, pool, tail); 762 m += 1 << order; 763 npages -= 1 << order; 764 } 765 } 766 767 /* 768 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned 769 * and sized set to the specified free list. 770 * 771 * When this function is called by a page allocation function, the caller 772 * should request insertion at the head unless the lower-order queues are 773 * known to be empty. The objective being to reduce the likelihood of long- 774 * term fragmentation by promoting contemporaneous allocation and (hopefully) 775 * deallocation. 776 * 777 * If npages is zero, this function does nothing and ignores the physical page 778 * parameter m. Otherwise, the physical page m's buddy must not be free. 779 */ 780 static vm_page_t 781 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool, 782 int tail) 783 { 784 int order; 785 786 KASSERT(npages == 0 || 787 ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) & 788 ((PAGE_SIZE << ilog2(npages)) - 1)) == 0, 789 ("vm_phys_enq_range: page %p and npages %u are misaligned", 790 m, npages)); 791 while (npages > 0) { 792 KASSERT(m->order == VM_NFREEORDER, 793 ("vm_phys_enq_range: page %p has unexpected order %d", 794 m, m->order)); 795 order = ffs(npages) - 1; 796 vm_phys_enq_chunk(fl, m, order, pool, tail); 797 m += 1 << order; 798 npages -= 1 << order; 799 } 800 return (m); 801 } 802 803 /* 804 * Complete initialization a contiguous, power of two-sized set of physical 805 * pages. 806 * 807 * If the pages currently belong to the lazy init pool, then the corresponding 808 * page structures must be initialized. In this case it is assumed that the 809 * first page in the run has already been initialized. 810 */ 811 static void 812 vm_phys_finish_init(vm_page_t m, int order) 813 { 814 #ifdef VM_FREEPOOL_LAZYINIT 815 if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) { 816 vm_paddr_t pa; 817 int segind; 818 819 TSENTER(); 820 pa = m->phys_addr + PAGE_SIZE; 821 segind = m->segind; 822 for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order]; 823 m_tmp++, pa += PAGE_SIZE) 824 vm_page_init_page(m_tmp, pa, segind, VM_NFREEPOOL); 825 TSEXIT(); 826 } 827 #endif 828 } 829 830 /* 831 * Tries to allocate the specified number of pages from the specified pool 832 * within the specified domain. Returns the actual number of allocated pages 833 * and a pointer to each page through the array ma[]. 834 * 835 * The returned pages may not be physically contiguous. 
However, in contrast 836 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0), 837 * calling this function once to allocate the desired number of pages will 838 * avoid wasted time in vm_phys_split_pages(). The allocated pages have no 839 * valid pool field set. 840 * 841 * The free page queues for the specified domain must be locked. 842 */ 843 int 844 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) 845 { 846 struct vm_freelist *alt, *fl; 847 vm_page_t m; 848 int avail, end, flind, freelist, i, oind, pind; 849 850 KASSERT(domain >= 0 && domain < vm_ndomains, 851 ("vm_phys_alloc_npages: domain %d is out of range", domain)); 852 KASSERT(vm_phys_pool_valid(pool), 853 ("vm_phys_alloc_npages: pool %d is out of range", pool)); 854 KASSERT(npages <= 1 << (VM_NFREEORDER - 1), 855 ("vm_phys_alloc_npages: npages %d is out of range", npages)); 856 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 857 i = 0; 858 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 859 flind = vm_freelist_to_flind[freelist]; 860 if (flind < 0) 861 continue; 862 fl = vm_phys_free_queues[domain][flind][pool]; 863 for (oind = 0; oind < VM_NFREEORDER; oind++) { 864 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 865 vm_freelist_rem(fl, m, oind); 866 avail = i + (1 << oind); 867 end = imin(npages, avail); 868 while (i < end) 869 ma[i++] = m++; 870 if (i == npages) { 871 /* 872 * Return excess pages to fl. Its order 873 * [0, oind) queues are empty. 874 */ 875 vm_phys_enq_range(m, avail - i, fl, 876 pool, 1); 877 return (npages); 878 } 879 } 880 } 881 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 882 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; 883 pind++) { 884 alt = vm_phys_free_queues[domain][flind][pind]; 885 while ((m = TAILQ_FIRST(&alt[oind].pl)) != 886 NULL) { 887 vm_freelist_rem(alt, m, oind); 888 vm_phys_finish_init(m, oind); 889 avail = i + (1 << oind); 890 end = imin(npages, avail); 891 while (i < end) 892 ma[i++] = m++; 893 if (i == npages) { 894 /* 895 * Return excess pages to fl. 896 * Its order [0, oind) queues 897 * are empty. 898 */ 899 vm_phys_enq_range(m, avail - i, 900 fl, pool, 1); 901 return (npages); 902 } 903 } 904 } 905 } 906 } 907 return (i); 908 } 909 910 /* 911 * Allocate a contiguous, power of two-sized set of physical pages from the 912 * specified free list. The free list must be specified using one of the 913 * manifest constants VM_FREELIST_*. 914 * 915 * The free page queues must be locked. 
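 *
 * Minimal calling sketch for this layer (illustrative only; most kernel
 * code reaches the physical allocator through vm_page_alloc*(), and the
 * free-page accounting and error handling done by those higher layers
 * are omitted here):
 *
 *        vm_domain_free_lock(VM_DOMAIN(domain));
 *        m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 0);
 *        vm_domain_free_unlock(VM_DOMAIN(domain));
 *
 * A non-NULL result is a single order-0 physical page owned by the
 * caller.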
916 */ 917 static vm_page_t 918 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) 919 { 920 struct vm_freelist *alt, *fl; 921 vm_page_t m; 922 int oind, pind, flind; 923 924 KASSERT(domain >= 0 && domain < vm_ndomains, 925 ("vm_phys_alloc_freelist_pages: domain %d is out of range", 926 domain)); 927 KASSERT(freelist < VM_NFREELIST, 928 ("vm_phys_alloc_freelist_pages: freelist %d is out of range", 929 freelist)); 930 KASSERT(vm_phys_pool_valid(pool), 931 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); 932 KASSERT(order < VM_NFREEORDER, 933 ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); 934 935 flind = vm_freelist_to_flind[freelist]; 936 /* Check if freelist is present */ 937 if (flind < 0) 938 return (NULL); 939 940 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 941 fl = &vm_phys_free_queues[domain][flind][pool][0]; 942 for (oind = order; oind < VM_NFREEORDER; oind++) { 943 m = TAILQ_FIRST(&fl[oind].pl); 944 if (m != NULL) { 945 vm_freelist_rem(fl, m, oind); 946 /* The order [order, oind) queues are empty. */ 947 vm_phys_split_pages(m, oind, fl, order, pool, 1); 948 return (m); 949 } 950 } 951 952 /* 953 * The given pool was empty. Find the largest 954 * contiguous, power-of-two-sized set of pages in any 955 * pool. Transfer these pages to the given pool, and 956 * use them to satisfy the allocation. 957 */ 958 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { 959 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) { 960 alt = &vm_phys_free_queues[domain][flind][pind][0]; 961 m = TAILQ_FIRST(&alt[oind].pl); 962 if (m != NULL) { 963 vm_freelist_rem(alt, m, oind); 964 vm_phys_finish_init(m, oind); 965 /* The order [order, oind) queues are empty. */ 966 vm_phys_split_pages(m, oind, fl, order, pool, 1); 967 return (m); 968 } 969 } 970 } 971 return (NULL); 972 } 973 974 /* 975 * Allocate a contiguous, power of two-sized set of physical pages 976 * from the free lists. 977 * 978 * The free page queues must be locked. 979 */ 980 vm_page_t 981 vm_phys_alloc_pages(int domain, int pool, int order) 982 { 983 vm_page_t m; 984 int freelist; 985 986 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 987 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); 988 if (m != NULL) 989 return (m); 990 } 991 return (NULL); 992 } 993 994 /* 995 * Find the vm_page corresponding to the given physical address, which must lie 996 * within the given physical memory segment. 997 */ 998 vm_page_t 999 vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa) 1000 { 1001 KASSERT(pa >= seg->start && pa < seg->end, 1002 ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa)); 1003 1004 return (&seg->first_page[atop(pa - seg->start)]); 1005 } 1006 1007 /* 1008 * Find the vm_page corresponding to the given physical address. 
1009 */ 1010 vm_page_t 1011 vm_phys_paddr_to_vm_page(vm_paddr_t pa) 1012 { 1013 struct vm_phys_seg *seg; 1014 1015 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) 1016 return (vm_phys_seg_paddr_to_vm_page(seg, pa)); 1017 return (NULL); 1018 } 1019 1020 vm_page_t 1021 vm_phys_fictitious_to_vm_page(vm_paddr_t pa) 1022 { 1023 struct vm_phys_fictitious_seg tmp, *seg; 1024 vm_page_t m; 1025 1026 m = NULL; 1027 tmp.start = pa; 1028 tmp.end = 0; 1029 1030 rw_rlock(&vm_phys_fictitious_reg_lock); 1031 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1032 rw_runlock(&vm_phys_fictitious_reg_lock); 1033 if (seg == NULL) 1034 return (NULL); 1035 1036 m = &seg->first_page[atop(pa - seg->start)]; 1037 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); 1038 1039 return (m); 1040 } 1041 1042 static inline void 1043 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, 1044 long page_count, vm_memattr_t memattr) 1045 { 1046 long i; 1047 1048 bzero(range, page_count * sizeof(*range)); 1049 for (i = 0; i < page_count; i++) { 1050 vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); 1051 range[i].oflags &= ~VPO_UNMANAGED; 1052 range[i].busy_lock = VPB_UNBUSIED; 1053 } 1054 } 1055 1056 int 1057 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, 1058 vm_memattr_t memattr) 1059 { 1060 struct vm_phys_fictitious_seg *seg; 1061 vm_page_t fp; 1062 long page_count; 1063 #ifdef VM_PHYSSEG_DENSE 1064 long pi, pe; 1065 long dpage_count; 1066 #endif 1067 1068 KASSERT(start < end, 1069 ("Start of segment isn't less than end (start: %jx end: %jx)", 1070 (uintmax_t)start, (uintmax_t)end)); 1071 1072 page_count = (end - start) / PAGE_SIZE; 1073 1074 #ifdef VM_PHYSSEG_DENSE 1075 pi = atop(start); 1076 pe = atop(end); 1077 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1078 fp = &vm_page_array[pi - first_page]; 1079 if ((pe - first_page) > vm_page_array_size) { 1080 /* 1081 * We have a segment that starts inside 1082 * of vm_page_array, but ends outside of it. 1083 * 1084 * Use vm_page_array pages for those that are 1085 * inside of the vm_page_array range, and 1086 * allocate the remaining ones. 1087 */ 1088 dpage_count = vm_page_array_size - (pi - first_page); 1089 vm_phys_fictitious_init_range(fp, start, dpage_count, 1090 memattr); 1091 page_count -= dpage_count; 1092 start += ptoa(dpage_count); 1093 goto alloc; 1094 } 1095 /* 1096 * We can allocate the full range from vm_page_array, 1097 * so there's no need to register the range in the tree. 1098 */ 1099 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1100 return (0); 1101 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1102 /* 1103 * We have a segment that ends inside of vm_page_array, 1104 * but starts outside of it. 1105 */ 1106 fp = &vm_page_array[0]; 1107 dpage_count = pe - first_page; 1108 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, 1109 memattr); 1110 end -= ptoa(dpage_count); 1111 page_count -= dpage_count; 1112 goto alloc; 1113 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1114 /* 1115 * Trying to register a fictitious range that expands before 1116 * and after vm_page_array. 
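 *
 * Recap of the VM_PHYSSEG_DENSE cases in this function, writing A for
 * the page-frame range covered by vm_page_array and R for the range
 * being registered:
 *
 *        R inside A               -> vm_page_array pages only, no tree entry
 *        R starts in A, ends out  -> array pages for the overlap, malloc()
 *                                    the rest and register it in the tree
 *        R ends in A, starts out  -> symmetric to the previous case
 *        R disjoint from A        -> malloc() everything, register in the tree
 *        R strictly contains A    -> rejected with EINVAL below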
1117 */ 1118 return (EINVAL); 1119 } else { 1120 alloc: 1121 #endif 1122 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, 1123 M_WAITOK); 1124 #ifdef VM_PHYSSEG_DENSE 1125 } 1126 #endif 1127 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1128 1129 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); 1130 seg->start = start; 1131 seg->end = end; 1132 seg->first_page = fp; 1133 1134 rw_wlock(&vm_phys_fictitious_reg_lock); 1135 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); 1136 rw_wunlock(&vm_phys_fictitious_reg_lock); 1137 1138 return (0); 1139 } 1140 1141 void 1142 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) 1143 { 1144 struct vm_phys_fictitious_seg *seg, tmp; 1145 #ifdef VM_PHYSSEG_DENSE 1146 long pi, pe; 1147 #endif 1148 1149 KASSERT(start < end, 1150 ("Start of segment isn't less than end (start: %jx end: %jx)", 1151 (uintmax_t)start, (uintmax_t)end)); 1152 1153 #ifdef VM_PHYSSEG_DENSE 1154 pi = atop(start); 1155 pe = atop(end); 1156 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1157 if ((pe - first_page) <= vm_page_array_size) { 1158 /* 1159 * This segment was allocated using vm_page_array 1160 * only, there's nothing to do since those pages 1161 * were never added to the tree. 1162 */ 1163 return; 1164 } 1165 /* 1166 * We have a segment that starts inside 1167 * of vm_page_array, but ends outside of it. 1168 * 1169 * Calculate how many pages were added to the 1170 * tree and free them. 1171 */ 1172 start = ptoa(first_page + vm_page_array_size); 1173 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1174 /* 1175 * We have a segment that ends inside of vm_page_array, 1176 * but starts outside of it. 1177 */ 1178 end = ptoa(first_page); 1179 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1180 /* Since it's not possible to register such a range, panic. */ 1181 panic( 1182 "Unregistering not registered fictitious range [%#jx:%#jx]", 1183 (uintmax_t)start, (uintmax_t)end); 1184 } 1185 #endif 1186 tmp.start = start; 1187 tmp.end = 0; 1188 1189 rw_wlock(&vm_phys_fictitious_reg_lock); 1190 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1191 if (seg->start != start || seg->end != end) { 1192 rw_wunlock(&vm_phys_fictitious_reg_lock); 1193 panic( 1194 "Unregistering not registered fictitious range [%#jx:%#jx]", 1195 (uintmax_t)start, (uintmax_t)end); 1196 } 1197 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); 1198 rw_wunlock(&vm_phys_fictitious_reg_lock); 1199 free(seg->first_page, M_FICT_PAGES); 1200 free(seg, M_FICT_PAGES); 1201 } 1202 1203 /* 1204 * Free a contiguous, power of two-sized set of physical pages. 1205 * The pool field in the first page determines the destination pool. 1206 * 1207 * The free page queues must be locked. 
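 *
 * The coalescing loop finds the buddy of a 2^order-page block by flipping
 * a single address bit and the start of the merged block by masking.
 * Standalone sketch of the arithmetic (illustrative, assuming 4 KiB
 * pages, i.e. PAGE_SHIFT == 12):
 *
 *        buddy  = pa & ((vm_paddr_t)1 << (PAGE_SHIFT + order)) ?
 *            pa ^ ((vm_paddr_t)1 << (PAGE_SHIFT + order)) :
 *            pa ^ ((vm_paddr_t)1 << (PAGE_SHIFT + order));
 *        merged = pa & ~(((vm_paddr_t)1 << (PAGE_SHIFT + order + 1)) - 1);
 *
 *        order 0, pa 0x3000 -> buddy 0x2000, order-1 block [0x2000, 0x4000)
 *        order 1, pa 0x4000 -> buddy 0x6000, order-2 block [0x4000, 0x8000)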
1208 */ 1209 void 1210 vm_phys_free_pages(vm_page_t m, int pool, int order) 1211 { 1212 struct vm_freelist *fl; 1213 struct vm_phys_seg *seg; 1214 vm_paddr_t pa; 1215 vm_page_t m_buddy; 1216 1217 KASSERT(m->order == VM_NFREEORDER, 1218 ("%s: page %p has unexpected order %d", 1219 __func__, m, m->order)); 1220 KASSERT(vm_phys_pool_valid(pool), 1221 ("%s: unexpected pool param %d", __func__, pool)); 1222 KASSERT(order < VM_NFREEORDER, 1223 ("%s: order %d is out of range", __func__, order)); 1224 seg = &vm_phys_segs[m->segind]; 1225 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1226 if (order < VM_NFREEORDER - 1) { 1227 pa = VM_PAGE_TO_PHYS(m); 1228 do { 1229 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); 1230 if (pa < seg->start || pa >= seg->end) 1231 break; 1232 m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa); 1233 if (m_buddy->order != order) 1234 break; 1235 fl = (*seg->free_queues)[m_buddy->pool]; 1236 vm_freelist_rem(fl, m_buddy, order); 1237 vm_phys_finish_init(m_buddy, order); 1238 order++; 1239 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); 1240 m = vm_phys_seg_paddr_to_vm_page(seg, pa); 1241 } while (order < VM_NFREEORDER - 1); 1242 } 1243 fl = (*seg->free_queues)[pool]; 1244 vm_freelist_add(fl, m, order, pool, 1); 1245 } 1246 1247 #ifdef VM_FREEPOOL_LAZYINIT 1248 /* 1249 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving 1250 * them to the default pool. This is a prerequisite for some rare operations 1251 * which need to scan the page array and thus depend on all pages being 1252 * initialized. 1253 */ 1254 static void 1255 vm_phys_lazy_init_domain(int domain, bool locked) 1256 { 1257 static bool initdone[MAXMEMDOM]; 1258 struct vm_domain *vmd; 1259 struct vm_freelist *fl; 1260 vm_page_t m; 1261 int pind; 1262 bool unlocked; 1263 1264 if (__predict_true(atomic_load_bool(&initdone[domain]))) 1265 return; 1266 1267 vmd = VM_DOMAIN(domain); 1268 if (locked) 1269 vm_domain_free_assert_locked(vmd); 1270 else 1271 vm_domain_free_lock(vmd); 1272 if (atomic_load_bool(&initdone[domain])) 1273 goto out; 1274 pind = VM_FREEPOOL_LAZYINIT; 1275 for (int freelist = 0; freelist < VM_NFREELIST; freelist++) { 1276 int flind; 1277 1278 flind = vm_freelist_to_flind[freelist]; 1279 if (flind < 0) 1280 continue; 1281 fl = vm_phys_free_queues[domain][flind][pind]; 1282 for (int oind = 0; oind < VM_NFREEORDER; oind++) { 1283 if (atomic_load_int(&fl[oind].lcnt) == 0) 1284 continue; 1285 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 1286 /* 1287 * Avoid holding the lock across the 1288 * initialization unless there's a free page 1289 * shortage. 
1290 */ 1291 vm_freelist_rem(fl, m, oind); 1292 unlocked = vm_domain_allocate(vmd, 1293 VM_ALLOC_NORMAL, 1 << oind); 1294 if (unlocked) 1295 vm_domain_free_unlock(vmd); 1296 vm_phys_finish_init(m, oind); 1297 if (unlocked) { 1298 vm_domain_freecnt_inc(vmd, 1 << oind); 1299 vm_domain_free_lock(vmd); 1300 } 1301 vm_phys_free_pages(m, VM_FREEPOOL_DEFAULT, 1302 oind); 1303 } 1304 } 1305 } 1306 atomic_store_bool(&initdone[domain], true); 1307 out: 1308 if (!locked) 1309 vm_domain_free_unlock(vmd); 1310 } 1311 1312 static void 1313 vm_phys_lazy_init(void) 1314 { 1315 for (int domain = 0; domain < vm_ndomains; domain++) 1316 vm_phys_lazy_init_domain(domain, false); 1317 atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT); 1318 } 1319 1320 static void 1321 vm_phys_lazy_init_kthr(void *arg __unused) 1322 { 1323 vm_phys_lazy_init(); 1324 kthread_exit(); 1325 } 1326 1327 static void 1328 vm_phys_lazy_sysinit(void *arg __unused) 1329 { 1330 struct thread *td; 1331 int error; 1332 1333 error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td, 1334 RFSTOPPED, 0, "vmlazyinit"); 1335 if (error == 0) { 1336 thread_lock(td); 1337 sched_prio(td, PRI_MIN_IDLE); 1338 sched_add(td, SRQ_BORING); 1339 } else { 1340 printf("%s: could not create lazy init thread: %d\n", 1341 __func__, error); 1342 vm_phys_lazy_init(); 1343 } 1344 } 1345 SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit, 1346 NULL); 1347 #endif /* VM_FREEPOOL_LAZYINIT */ 1348 1349 /* 1350 * Free a contiguous, arbitrarily sized set of physical pages, without 1351 * merging across set boundaries. Assumes no pages have a valid pool field. 1352 * 1353 * The free page queues must be locked. 1354 */ 1355 void 1356 vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages) 1357 { 1358 struct vm_freelist *fl; 1359 struct vm_phys_seg *seg; 1360 vm_page_t m_end; 1361 vm_paddr_t diff, lo; 1362 int order; 1363 1364 /* 1365 * Avoid unnecessary coalescing by freeing the pages in the largest 1366 * possible power-of-two-sized subsets. 1367 */ 1368 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1369 seg = &vm_phys_segs[m->segind]; 1370 fl = (*seg->free_queues)[pool]; 1371 m_end = m + npages; 1372 /* Free blocks of increasing size. */ 1373 lo = atop(VM_PAGE_TO_PHYS(m)); 1374 if (m < m_end && 1375 (diff = lo ^ (lo + npages - 1)) != 0) { 1376 order = min(ilog2(diff), VM_NFREEORDER - 1); 1377 m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1378 pool, 1); 1379 } 1380 1381 /* Free blocks of maximum size. */ 1382 order = VM_NFREEORDER - 1; 1383 while (m + (1 << order) <= m_end) { 1384 KASSERT(seg == &vm_phys_segs[m->segind], 1385 ("%s: page range [%p,%p) spans multiple segments", 1386 __func__, m_end - npages, m)); 1387 vm_phys_enq_chunk(fl, m, order, pool, 1); 1388 m += 1 << order; 1389 } 1390 /* Free blocks of diminishing size. */ 1391 vm_phys_enq_beg(m, m_end - m, fl, pool, 1); 1392 } 1393 1394 /* 1395 * Free a contiguous, arbitrarily sized set of physical pages. 1396 * Assumes that every page but the first has no valid pool field. 1397 * Uses the pool value in the first page if valid, otherwise default. 1398 * 1399 * The free page queues must be locked. 
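 *
 * Worked example with hypothetical numbers: freeing npages = 10 starting
 * at page frame 3, i.e. frames [3, 13), with lo ^ (lo + npages) = 3 ^ 13
 * = 14 giving max_order = 3:
 *
 *        order_start = ffsll(3) - 1  = 0   ->  head block [3, 4)
 *        order_end   = ffsll(13) - 1 = 0   ->  tail block [12, 13)
 *        middle                            ->  frames     [4, 12)
 *
 * The middle is handed to vm_phys_enqueue_contig(), which never merges
 * across the set boundaries, while the one-page head and tail go through
 * vm_phys_free_pages() and may coalesce with free neighbors outside the
 * range.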
1400 */ 1401 void 1402 vm_phys_free_contig(vm_page_t m, int pool, u_long npages) 1403 { 1404 vm_paddr_t lo; 1405 vm_page_t m_start, m_end; 1406 unsigned max_order, order_start, order_end; 1407 1408 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1409 1410 lo = atop(VM_PAGE_TO_PHYS(m)); 1411 max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1); 1412 1413 m_start = m; 1414 order_start = ffsll(lo) - 1; 1415 if (order_start < max_order) 1416 m_start += 1 << order_start; 1417 m_end = m + npages; 1418 order_end = ffsll(lo + npages) - 1; 1419 if (order_end < max_order) 1420 m_end -= 1 << order_end; 1421 /* 1422 * Avoid unnecessary coalescing by freeing the pages at the start and 1423 * end of the range last. 1424 */ 1425 if (m_start < m_end) 1426 vm_phys_enqueue_contig(m_start, pool, m_end - m_start); 1427 if (order_start < max_order) 1428 vm_phys_free_pages(m, pool, order_start); 1429 if (order_end < max_order) 1430 vm_phys_free_pages(m_end, pool, order_end); 1431 } 1432 1433 /* 1434 * Identify the first address range within segment segind or greater 1435 * that matches the domain, lies within the low/high range, and has 1436 * enough pages. Return -1 if there is none. 1437 */ 1438 int 1439 vm_phys_find_range(vm_page_t bounds[], int segind, int domain, 1440 u_long npages, vm_paddr_t low, vm_paddr_t high) 1441 { 1442 vm_paddr_t pa_end, pa_start; 1443 struct vm_phys_seg *end_seg, *seg; 1444 1445 KASSERT(npages > 0, ("npages is zero")); 1446 KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range")); 1447 end_seg = &vm_phys_segs[vm_phys_nsegs]; 1448 for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) { 1449 if (seg->domain != domain) 1450 continue; 1451 if (seg->start >= high) 1452 return (-1); 1453 pa_start = MAX(low, seg->start); 1454 pa_end = MIN(high, seg->end); 1455 if (pa_end - pa_start < ptoa(npages)) 1456 continue; 1457 #ifdef VM_FREEPOOL_LAZYINIT 1458 /* 1459 * The pages on the free lists must be initialized. 1460 */ 1461 vm_phys_lazy_init_domain(domain, false); 1462 #endif 1463 bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start); 1464 bounds[1] = &seg->first_page[atop(pa_end - seg->start)]; 1465 return (seg - vm_phys_segs); 1466 } 1467 return (-1); 1468 } 1469 1470 /* 1471 * Search for the given physical page "m" in the free lists. If the search 1472 * succeeds, remove "m" from the free lists and return true. Otherwise, return 1473 * false, indicating that "m" is not in the free lists. 1474 * 1475 * The free page queues must be locked. 1476 */ 1477 bool 1478 vm_phys_unfree_page(vm_paddr_t pa) 1479 { 1480 struct vm_freelist *fl; 1481 struct vm_phys_seg *seg; 1482 vm_paddr_t pa_half; 1483 vm_page_t m, m_set, m_tmp; 1484 int order, pool; 1485 1486 seg = vm_phys_paddr_to_seg(pa); 1487 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1488 1489 #ifdef VM_FREEPOOL_LAZYINIT 1490 /* 1491 * The pages on the free lists must be initialized. 1492 */ 1493 vm_phys_lazy_init_domain(seg->domain, true); 1494 #endif 1495 1496 /* 1497 * First, find the contiguous, power of two-sized set of free 1498 * physical pages containing the given physical page "m" and 1499 * assign it to "m_set". 
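 *
 * Illustrative arithmetic, assuming 4 KiB pages: the candidate block of a
 * given order that would contain the page is found by clearing the low
 * PAGE_SHIFT + order address bits, e.g. for pa = 0x7000:
 *
 *        order 1: 0x7000 & ~0x1fff = 0x6000   2-page block [0x6000, 0x8000)
 *        order 2: 0x7000 & ~0x3fff = 0x4000   4-page block [0x4000, 0x8000)
 *        order 3: 0x7000 & ~0x7fff = 0x0000   8-page block [0x0000, 0x8000)
 *
 * The loop walks the order upward until the candidate head page is
 * marked as the start of a free block or the maximum order is reached.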
1500 */ 1501 m = vm_phys_paddr_to_vm_page(pa); 1502 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && 1503 order < VM_NFREEORDER - 1; ) { 1504 order++; 1505 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); 1506 if (pa >= seg->start) 1507 m_set = vm_phys_seg_paddr_to_vm_page(seg, pa); 1508 else 1509 return (false); 1510 } 1511 if (m_set->order < order) 1512 return (false); 1513 if (m_set->order == VM_NFREEORDER) 1514 return (false); 1515 KASSERT(m_set->order < VM_NFREEORDER, 1516 ("vm_phys_unfree_page: page %p has unexpected order %d", 1517 m_set, m_set->order)); 1518 1519 /* 1520 * Next, remove "m_set" from the free lists. Finally, extract 1521 * "m" from "m_set" using an iterative algorithm: While "m_set" 1522 * is larger than a page, shrink "m_set" by returning the half 1523 * of "m_set" that does not contain "m" to the free lists. 1524 */ 1525 pool = m_set->pool; 1526 fl = (*seg->free_queues)[pool]; 1527 order = m_set->order; 1528 vm_freelist_rem(fl, m_set, order); 1529 while (order > 0) { 1530 order--; 1531 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); 1532 if (m->phys_addr < pa_half) 1533 m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half); 1534 else { 1535 m_tmp = m_set; 1536 m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half); 1537 } 1538 vm_freelist_add(fl, m_tmp, order, pool, 0); 1539 } 1540 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); 1541 return (true); 1542 } 1543 1544 /* 1545 * Find a run of contiguous physical pages, meeting alignment requirements, from 1546 * a list of max-sized page blocks, where we need at least two consecutive 1547 * blocks to satisfy the (large) page request. 1548 */ 1549 static vm_page_t 1550 vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages, 1551 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1552 { 1553 struct vm_phys_seg *seg; 1554 vm_page_t m, m_iter, m_ret; 1555 vm_paddr_t max_size, size; 1556 int max_order; 1557 1558 max_order = VM_NFREEORDER - 1; 1559 size = npages << PAGE_SHIFT; 1560 max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order); 1561 KASSERT(size > max_size, ("size is too small")); 1562 1563 /* 1564 * In order to avoid examining any free max-sized page block more than 1565 * twice, identify the ones that are first in a physically-contiguous 1566 * sequence of such blocks, and only for those walk the sequence to 1567 * check if there are enough free blocks starting at a properly aligned 1568 * block. Thus, no block is checked for free-ness more than twice. 1569 */ 1570 TAILQ_FOREACH(m, &fl[max_order].pl, listq) { 1571 /* 1572 * Skip m unless it is first in a sequence of free max page 1573 * blocks >= low in its segment. 1574 */ 1575 seg = &vm_phys_segs[m->segind]; 1576 if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start)) 1577 continue; 1578 if (VM_PAGE_TO_PHYS(m) >= max_size && 1579 VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) && 1580 max_order == m[-1 << max_order].order) 1581 continue; 1582 1583 /* 1584 * Advance m_ret from m to the first of the sequence, if any, 1585 * that satisfies alignment conditions and might leave enough 1586 * space. 
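 *
 * The filter applied here is vm_addr_ok(); an equivalent standalone
 * sketch of the two conditions it checks, for power-of-two alignment and
 * boundary values with boundary == 0 meaning "no boundary":
 *
 *        aligned  = (pa & (alignment - 1)) == 0;
 *        no_cross = boundary == 0 ||
 *            (pa & ~(boundary - 1)) == ((pa + size - 1) & ~(boundary - 1));
 *
 * i.e. the start must be alignment-aligned and the first and last byte
 * of the run must fall within the same boundary-sized window.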
1587 */ 1588 m_ret = m; 1589 while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret), 1590 size, alignment, boundary) && 1591 VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) && 1592 max_order == m_ret[1 << max_order].order) 1593 m_ret += 1 << max_order; 1594 1595 /* 1596 * Skip m unless some block m_ret in the sequence is properly 1597 * aligned, and begins a sequence of enough pages less than 1598 * high, and in the same segment. 1599 */ 1600 if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end)) 1601 continue; 1602 1603 /* 1604 * Skip m unless the blocks to allocate starting at m_ret are 1605 * all free. 1606 */ 1607 for (m_iter = m_ret; 1608 m_iter < m_ret + npages && max_order == m_iter->order; 1609 m_iter += 1 << max_order) { 1610 } 1611 if (m_iter < m_ret + npages) 1612 continue; 1613 return (m_ret); 1614 } 1615 return (NULL); 1616 } 1617 1618 /* 1619 * Find a run of contiguous physical pages from the specified free list 1620 * table. 1621 */ 1622 static vm_page_t 1623 vm_phys_find_queues_contig( 1624 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX], 1625 u_long npages, vm_paddr_t low, vm_paddr_t high, 1626 u_long alignment, vm_paddr_t boundary) 1627 { 1628 struct vm_freelist *fl; 1629 vm_page_t m_ret; 1630 vm_paddr_t pa, pa_end, size; 1631 int oind, order, pind; 1632 1633 KASSERT(npages > 0, ("npages is 0")); 1634 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1635 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1636 /* Compute the queue that is the best fit for npages. */ 1637 order = flsl(npages - 1); 1638 /* Search for a large enough free block. */ 1639 size = npages << PAGE_SHIFT; 1640 for (oind = order; oind < VM_NFREEORDER; oind++) { 1641 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) { 1642 fl = (*queues)[pind]; 1643 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { 1644 /* 1645 * Determine if the address range starting at pa 1646 * is within the given range, satisfies the 1647 * given alignment, and does not cross the given 1648 * boundary. 1649 */ 1650 pa = VM_PAGE_TO_PHYS(m_ret); 1651 pa_end = pa + size; 1652 if (low <= pa && pa_end <= high && 1653 vm_addr_ok(pa, size, alignment, boundary)) 1654 return (m_ret); 1655 } 1656 } 1657 } 1658 if (order < VM_NFREEORDER) 1659 return (NULL); 1660 /* Search for a long-enough sequence of max-order blocks. */ 1661 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) { 1662 fl = (*queues)[pind]; 1663 m_ret = vm_phys_find_freelist_contig(fl, npages, 1664 low, high, alignment, boundary); 1665 if (m_ret != NULL) 1666 return (m_ret); 1667 } 1668 return (NULL); 1669 } 1670 1671 /* 1672 * Allocate a contiguous set of physical pages of the given size 1673 * "npages" from the free lists. All of the physical pages must be at 1674 * or above the given physical address "low" and below the given 1675 * physical address "high". The given value "alignment" determines the 1676 * alignment of the first physical page in the set. If the given value 1677 * "boundary" is non-zero, then the set of physical pages cannot cross 1678 * any physical address boundary that is a multiple of that value. Both 1679 * "alignment" and "boundary" must be a power of two. Sets the pool 1680 * field to DEFAULT in the first allocated page. 
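 *
 * Hypothetical usage sketch (illustrative only; real callers normally go
 * through vm_page_alloc_contig(), and the free-page accounting done by
 * those higher layers is omitted here):
 *
 *        vm_domain_free_lock(VM_DOMAIN(domain));
 *        m = vm_phys_alloc_contig(domain, 16, 0, (vm_paddr_t)1 << 32,
 *            16 * PAGE_SIZE, 0);
 *        vm_domain_free_unlock(VM_DOMAIN(domain));
 *
 * This asks for 16 contiguous pages below 4 GiB, aligned to their own
 * size, with no boundary restriction.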
1681 */ 1682 vm_page_t 1683 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1684 u_long alignment, vm_paddr_t boundary) 1685 { 1686 vm_paddr_t pa_end, pa_start; 1687 struct vm_freelist *fl; 1688 vm_page_t m, m_run; 1689 struct vm_phys_seg *seg; 1690 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX]; 1691 int oind, segind; 1692 1693 KASSERT(npages > 0, ("npages is 0")); 1694 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1695 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1696 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 1697 if (low >= high) 1698 return (NULL); 1699 queues = NULL; 1700 m_run = NULL; 1701 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 1702 seg = &vm_phys_segs[segind]; 1703 if (seg->start >= high || seg->domain != domain) 1704 continue; 1705 if (low >= seg->end) 1706 break; 1707 if (low <= seg->start) 1708 pa_start = seg->start; 1709 else 1710 pa_start = low; 1711 if (high < seg->end) 1712 pa_end = high; 1713 else 1714 pa_end = seg->end; 1715 if (pa_end - pa_start < ptoa(npages)) 1716 continue; 1717 /* 1718 * If a previous segment led to a search using 1719 * the same free lists as would this segment, then 1720 * we've actually already searched within this 1721 * too. So skip it. 1722 */ 1723 if (seg->free_queues == queues) 1724 continue; 1725 queues = seg->free_queues; 1726 m_run = vm_phys_find_queues_contig(queues, npages, 1727 low, high, alignment, boundary); 1728 if (m_run != NULL) 1729 break; 1730 } 1731 if (m_run == NULL) 1732 return (NULL); 1733 1734 /* Allocate pages from the page-range found. */ 1735 for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) { 1736 fl = (*queues)[m->pool]; 1737 oind = m->order; 1738 vm_freelist_rem(fl, m, oind); 1739 vm_phys_finish_init(m, oind); 1740 } 1741 /* Return excess pages to the free lists. */ 1742 fl = (*queues)[VM_FREEPOOL_DEFAULT]; 1743 vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 1744 VM_FREEPOOL_DEFAULT, 0); 1745 1746 /* Return page verified to satisfy conditions of request. */ 1747 pa_start = VM_PAGE_TO_PHYS(m_run); 1748 KASSERT(low <= pa_start, 1749 ("memory allocated below minimum requested range")); 1750 KASSERT(pa_start + ptoa(npages) <= high, 1751 ("memory allocated above maximum requested range")); 1752 seg = &vm_phys_segs[m_run->segind]; 1753 KASSERT(seg->domain == domain, 1754 ("memory not allocated from specified domain")); 1755 KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary), 1756 ("memory alignment/boundary constraints not satisfied")); 1757 return (m_run); 1758 } 1759 1760 /* 1761 * Return the index of the first unused slot which may be the terminating 1762 * entry. 1763 */ 1764 static int 1765 vm_phys_avail_count(void) 1766 { 1767 int i; 1768 1769 for (i = 0; phys_avail[i + 1]; i += 2) 1770 continue; 1771 if (i > PHYS_AVAIL_ENTRIES) 1772 panic("Improperly terminated phys_avail %d entries", i); 1773 1774 return (i); 1775 } 1776 1777 /* 1778 * Assert that a phys_avail entry is valid. 
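 *
 * phys_avail[] is laid out as (start, end) pairs at even/odd indices and
 * is terminated by a pair of zero entries (see the comment above its
 * definition).  Standalone sketch of walking it, as the helpers in this
 * file do:
 *
 *        for (i = 0; phys_avail[i + 1] != 0; i += 2)
 *                printf("[%#jx, %#jx)\n", (uintmax_t)phys_avail[i],
 *                    (uintmax_t)phys_avail[i + 1]);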
1779 */ 1780 static void 1781 vm_phys_avail_check(int i) 1782 { 1783 if (phys_avail[i] & PAGE_MASK) 1784 panic("Unaligned phys_avail[%d]: %#jx", i, 1785 (intmax_t)phys_avail[i]); 1786 if (phys_avail[i+1] & PAGE_MASK) 1787 panic("Unaligned phys_avail[%d + 1]: %#jx", i, 1788 (intmax_t)phys_avail[i]); 1789 if (phys_avail[i + 1] < phys_avail[i]) 1790 panic("phys_avail[%d] start %#jx < end %#jx", i, 1791 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]); 1792 } 1793 1794 /* 1795 * Return the index of an overlapping phys_avail entry or -1. 1796 */ 1797 #ifdef NUMA 1798 static int 1799 vm_phys_avail_find(vm_paddr_t pa) 1800 { 1801 int i; 1802 1803 for (i = 0; phys_avail[i + 1]; i += 2) 1804 if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) 1805 return (i); 1806 return (-1); 1807 } 1808 #endif 1809 1810 /* 1811 * Return the index of the largest entry. 1812 */ 1813 int 1814 vm_phys_avail_largest(void) 1815 { 1816 vm_paddr_t sz, largesz; 1817 int largest; 1818 int i; 1819 1820 largest = 0; 1821 largesz = 0; 1822 for (i = 0; phys_avail[i + 1]; i += 2) { 1823 sz = vm_phys_avail_size(i); 1824 if (sz > largesz) { 1825 largesz = sz; 1826 largest = i; 1827 } 1828 } 1829 1830 return (largest); 1831 } 1832 1833 vm_paddr_t 1834 vm_phys_avail_size(int i) 1835 { 1836 1837 return (phys_avail[i + 1] - phys_avail[i]); 1838 } 1839 1840 /* 1841 * Split an entry at the address 'pa'. Return zero on success or errno. 1842 */ 1843 static int 1844 vm_phys_avail_split(vm_paddr_t pa, int i) 1845 { 1846 int cnt; 1847 1848 vm_phys_avail_check(i); 1849 if (pa <= phys_avail[i] || pa >= phys_avail[i + 1]) 1850 panic("vm_phys_avail_split: invalid address"); 1851 cnt = vm_phys_avail_count(); 1852 if (cnt >= PHYS_AVAIL_ENTRIES) 1853 return (ENOSPC); 1854 memmove(&phys_avail[i + 2], &phys_avail[i], 1855 (cnt - i) * sizeof(phys_avail[0])); 1856 phys_avail[i + 1] = pa; 1857 phys_avail[i + 2] = pa; 1858 vm_phys_avail_check(i); 1859 vm_phys_avail_check(i+2); 1860 1861 return (0); 1862 } 1863 1864 /* 1865 * Check if a given physical address can be included as part of a crash dump. 1866 */ 1867 bool 1868 vm_phys_is_dumpable(vm_paddr_t pa) 1869 { 1870 vm_page_t m; 1871 int i; 1872 1873 if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) 1874 return ((m->flags & PG_NODUMP) == 0); 1875 1876 for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { 1877 if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) 1878 return (true); 1879 } 1880 return (false); 1881 } 1882 1883 void 1884 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end) 1885 { 1886 struct vm_phys_seg *seg; 1887 1888 if (vm_phys_early_nsegs == -1) 1889 panic("%s: called after initialization", __func__); 1890 if (vm_phys_early_nsegs == nitems(vm_phys_early_segs)) 1891 panic("%s: ran out of early segments", __func__); 1892 1893 seg = &vm_phys_early_segs[vm_phys_early_nsegs++]; 1894 seg->start = start; 1895 seg->end = end; 1896 } 1897 1898 /* 1899 * This routine allocates NUMA node specific memory before the page 1900 * allocator is bootstrapped. 1901 */ 1902 vm_paddr_t 1903 vm_phys_early_alloc(int domain, size_t alloc_size) 1904 { 1905 #ifdef NUMA 1906 int mem_index; 1907 #endif 1908 int i, biggestone; 1909 vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align; 1910 1911 KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains), 1912 ("%s: invalid domain index %d", __func__, domain)); 1913 1914 /* 1915 * Search the mem_affinity array for the biggest address 1916 * range in the desired domain. This is used to constrain 1917 * the phys_avail selection below. 
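 *
 * Hypothetical example: if domain 1 covers [8 GiB, 16 GiB), mem_start and
 * mem_end are set to that window, and the loop below skips any phys_avail
 * chunk whose usable tail falls outside it, even if that chunk is the
 * largest one in the system.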
1918 */ 1919 biggestsize = 0; 1920 mem_start = 0; 1921 mem_end = -1; 1922 #ifdef NUMA 1923 mem_index = 0; 1924 if (mem_affinity != NULL) { 1925 for (i = 0;; i++) { 1926 size = mem_affinity[i].end - mem_affinity[i].start; 1927 if (size == 0) 1928 break; 1929 if (domain != -1 && mem_affinity[i].domain != domain) 1930 continue; 1931 if (size > biggestsize) { 1932 mem_index = i; 1933 biggestsize = size; 1934 } 1935 } 1936 mem_start = mem_affinity[mem_index].start; 1937 mem_end = mem_affinity[mem_index].end; 1938 } 1939 #endif 1940 1941 /* 1942 * Now find biggest physical segment in within the desired 1943 * numa domain. 1944 */ 1945 biggestsize = 0; 1946 biggestone = 0; 1947 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1948 /* skip regions that are out of range */ 1949 if (phys_avail[i+1] - alloc_size < mem_start || 1950 phys_avail[i+1] > mem_end) 1951 continue; 1952 size = vm_phys_avail_size(i); 1953 if (size > biggestsize) { 1954 biggestone = i; 1955 biggestsize = size; 1956 } 1957 } 1958 alloc_size = round_page(alloc_size); 1959 1960 /* 1961 * Grab single pages from the front to reduce fragmentation. 1962 */ 1963 if (alloc_size == PAGE_SIZE) { 1964 pa = phys_avail[biggestone]; 1965 phys_avail[biggestone] += PAGE_SIZE; 1966 vm_phys_avail_check(biggestone); 1967 return (pa); 1968 } 1969 1970 /* 1971 * Naturally align large allocations. 1972 */ 1973 align = phys_avail[biggestone + 1] & (alloc_size - 1); 1974 if (alloc_size + align > biggestsize) 1975 panic("cannot find a large enough size\n"); 1976 if (align != 0 && 1977 vm_phys_avail_split(phys_avail[biggestone + 1] - align, 1978 biggestone) != 0) 1979 /* Wasting memory. */ 1980 phys_avail[biggestone + 1] -= align; 1981 1982 phys_avail[biggestone + 1] -= alloc_size; 1983 vm_phys_avail_check(biggestone); 1984 pa = phys_avail[biggestone + 1]; 1985 return (pa); 1986 } 1987 1988 void 1989 vm_phys_early_startup(void) 1990 { 1991 struct vm_phys_seg *seg; 1992 int i; 1993 1994 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1995 phys_avail[i] = round_page(phys_avail[i]); 1996 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 1997 } 1998 1999 for (i = 0; i < vm_phys_early_nsegs; i++) { 2000 seg = &vm_phys_early_segs[i]; 2001 vm_phys_add_seg(seg->start, seg->end); 2002 } 2003 vm_phys_early_nsegs = -1; 2004 2005 #ifdef NUMA 2006 /* Force phys_avail to be split by domain. */ 2007 if (mem_affinity != NULL) { 2008 int idx; 2009 2010 for (i = 0; mem_affinity[i].end != 0; i++) { 2011 idx = vm_phys_avail_find(mem_affinity[i].start); 2012 if (idx != -1 && 2013 phys_avail[idx] != mem_affinity[i].start) 2014 vm_phys_avail_split(mem_affinity[i].start, idx); 2015 idx = vm_phys_avail_find(mem_affinity[i].end); 2016 if (idx != -1 && 2017 phys_avail[idx] != mem_affinity[i].end) 2018 vm_phys_avail_split(mem_affinity[i].end, idx); 2019 } 2020 } 2021 #endif 2022 } 2023 2024 #ifdef DDB 2025 /* 2026 * Show the number of physical pages in each of the free lists. 
2027 */ 2028 DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE) 2029 { 2030 struct vm_freelist *fl; 2031 int flind, oind, pind, dom; 2032 2033 for (dom = 0; dom < vm_ndomains; dom++) { 2034 db_printf("DOMAIN: %d\n", dom); 2035 for (flind = 0; flind < vm_nfreelists; flind++) { 2036 db_printf("FREE LIST %d:\n" 2037 "\n ORDER (SIZE) | NUMBER" 2038 "\n ", flind); 2039 for (pind = 0; pind < VM_NFREEPOOL; pind++) 2040 db_printf(" | POOL %d", pind); 2041 db_printf("\n-- "); 2042 for (pind = 0; pind < VM_NFREEPOOL; pind++) 2043 db_printf("-- -- "); 2044 db_printf("--\n"); 2045 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 2046 db_printf(" %2.2d (%6.6dK)", oind, 2047 1 << (PAGE_SHIFT - 10 + oind)); 2048 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 2049 fl = vm_phys_free_queues[dom][flind][pind]; 2050 db_printf(" | %6.6d", fl[oind].lcnt); 2051 } 2052 db_printf("\n"); 2053 } 2054 db_printf("\n"); 2055 } 2056 db_printf("\n"); 2057 } 2058 } 2059 #endif 2060