1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2002-2006 Rice University 5 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu> 6 * All rights reserved. 7 * 8 * This software was developed for the FreeBSD Project by Alan L. Cox, 9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Physical memory system implementation 36 * 37 * Any external functions defined by this module are only to be used by the 38 * virtual memory system. 
39 */ 40 41 #include <sys/cdefs.h> 42 #include "opt_ddb.h" 43 #include "opt_vm.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/domainset.h> 48 #include <sys/lock.h> 49 #include <sys/kernel.h> 50 #include <sys/kthread.h> 51 #include <sys/malloc.h> 52 #include <sys/mutex.h> 53 #include <sys/proc.h> 54 #include <sys/queue.h> 55 #include <sys/rwlock.h> 56 #include <sys/sbuf.h> 57 #include <sys/sched.h> 58 #include <sys/sysctl.h> 59 #include <sys/tree.h> 60 #include <sys/tslog.h> 61 #include <sys/unistd.h> 62 #include <sys/vmmeter.h> 63 64 #include <ddb/ddb.h> 65 66 #include <vm/vm.h> 67 #include <vm/vm_extern.h> 68 #include <vm/vm_param.h> 69 #include <vm/vm_kern.h> 70 #include <vm/vm_object.h> 71 #include <vm/vm_page.h> 72 #include <vm/vm_phys.h> 73 #include <vm/vm_pagequeue.h> 74 75 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, 76 "Too many physsegs."); 77 _Static_assert(sizeof(long long) >= sizeof(vm_paddr_t), 78 "vm_paddr_t too big for ffsll, flsll."); 79 80 #ifdef NUMA 81 struct mem_affinity __read_mostly *mem_affinity; 82 int __read_mostly *mem_locality; 83 84 static int numa_disabled; 85 static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 86 "NUMA options"); 87 SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 88 &numa_disabled, 0, "NUMA-awareness in the allocators is disabled"); 89 #endif 90 91 int __read_mostly vm_ndomains = 1; 92 domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1); 93 94 struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX]; 95 int __read_mostly vm_phys_nsegs; 96 static struct vm_phys_seg vm_phys_early_segs[8]; 97 static int vm_phys_early_nsegs; 98 99 struct vm_phys_fictitious_seg; 100 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, 101 struct vm_phys_fictitious_seg *); 102 103 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = 104 RB_INITIALIZER(&vm_phys_fictitious_tree); 105 106 struct vm_phys_fictitious_seg { 107 RB_ENTRY(vm_phys_fictitious_seg) node; 108 /* Memory region data */ 109 vm_paddr_t start; 110 vm_paddr_t end; 111 vm_page_t first_page; 112 }; 113 114 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, 115 vm_phys_fictitious_cmp); 116 117 static struct rwlock_padalign vm_phys_fictitious_reg_lock; 118 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); 119 120 static struct vm_freelist __aligned(CACHE_LINE_SIZE) 121 vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL] 122 [VM_NFREEORDER_MAX]; 123 124 static int __read_mostly vm_nfreelists; 125 126 /* 127 * These "avail lists" are globals used to communicate boot-time physical 128 * memory layout to other parts of the kernel. Each physically contiguous 129 * region of memory is defined by a start address at an even index and an 130 * end address at the following odd index. Each list is terminated by a 131 * pair of zero entries. 132 * 133 * dump_avail tells the dump code what regions to include in a crash dump, and 134 * phys_avail is all of the remaining physical memory that is available for 135 * the vm system. 136 * 137 * Initially dump_avail and phys_avail are identical. Boot time memory 138 * allocations remove extents from phys_avail that may still be included 139 * in dumps. 140 */ 141 vm_paddr_t phys_avail[PHYS_AVAIL_COUNT]; 142 vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; 143 144 /* 145 * Provides the mapping from VM_FREELIST_* to free list indices (flind). 
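 * For example, on a configuration that creates all three of the DEFAULT,
 * DMA32, and LOWMEM free lists, the resulting mapping is the identity:
 * vm_freelist_to_flind[VM_FREELIST_DEFAULT] == 0, [VM_FREELIST_DMA32] == 1,
 * and [VM_FREELIST_LOWMEM] == 2, with vm_nfreelists == 3.  A free list that
 * is not created ends up with either the index of the nearest created list
 * preceding it or -1, which is why lookups test for a negative value.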
146 */ 147 static int __read_mostly vm_freelist_to_flind[VM_NFREELIST]; 148 static int __read_mostly vm_default_freepool; 149 150 CTASSERT(VM_FREELIST_DEFAULT == 0); 151 152 #ifdef VM_FREELIST_DMA32 153 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) 154 #endif 155 156 /* 157 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about 158 * the ordering of the free list boundaries. 159 */ 160 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) 161 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); 162 #endif 163 164 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); 165 SYSCTL_OID(_vm, OID_AUTO, phys_free, 166 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 167 sysctl_vm_phys_free, "A", 168 "Phys Free Info"); 169 170 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); 171 SYSCTL_OID(_vm, OID_AUTO, phys_segs, 172 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 173 sysctl_vm_phys_segs, "A", 174 "Phys Seg Info"); 175 176 #ifdef NUMA 177 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); 178 SYSCTL_OID(_vm, OID_AUTO, phys_locality, 179 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 180 sysctl_vm_phys_locality, "A", 181 "Phys Locality Info"); 182 #endif 183 184 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, 185 &vm_ndomains, 0, "Number of physical memory domains available."); 186 187 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); 188 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); 189 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, 190 int order, int pool, int tail); 191 192 static bool __diagused 193 vm_phys_pool_valid(int pool) 194 { 195 #ifdef VM_FREEPOOL_LAZYINIT 196 if (pool == VM_FREEPOOL_LAZYINIT) 197 return (false); 198 #endif 199 return (pool >= 0 && pool < VM_NFREEPOOL); 200 } 201 202 /* 203 * Red-black tree helpers for vm fictitious range management. 204 */ 205 static inline int 206 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, 207 struct vm_phys_fictitious_seg *range) 208 { 209 210 KASSERT(range->start != 0 && range->end != 0, 211 ("Invalid range passed on search for vm_fictitious page")); 212 if (p->start >= range->end) 213 return (1); 214 if (p->start < range->start) 215 return (-1); 216 217 return (0); 218 } 219 220 static int 221 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, 222 struct vm_phys_fictitious_seg *p2) 223 { 224 225 /* Check if this is a search for a page */ 226 if (p1->end == 0) 227 return (vm_phys_fictitious_in_range(p1, p2)); 228 229 KASSERT(p2->end != 0, 230 ("Invalid range passed as second parameter to vm fictitious comparison")); 231 232 /* Searching to add a new range */ 233 if (p1->end <= p2->start) 234 return (-1); 235 if (p1->start >= p2->end) 236 return (1); 237 238 panic("Trying to add overlapping vm fictitious ranges:\n" 239 "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, 240 (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); 241 } 242 243 int 244 vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used, 245 vm_paddr_t high __numa_used) 246 { 247 #ifdef NUMA 248 domainset_t mask; 249 int i; 250 251 if (vm_ndomains == 1 || mem_affinity == NULL) 252 return (0); 253 254 DOMAINSET_ZERO(&mask); 255 /* 256 * Check for any memory that overlaps low, high. 
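	 * Every domain whose affinity range intersects [low, high] is
	 * added to the mask; "prefer" is returned if it is in the mask,
	 * otherwise the lowest-numbered matching domain is chosen.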
257 */ 258 for (i = 0; mem_affinity[i].end != 0; i++) 259 if (mem_affinity[i].start <= high && 260 mem_affinity[i].end >= low) 261 DOMAINSET_SET(mem_affinity[i].domain, &mask); 262 if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask)) 263 return (prefer); 264 if (DOMAINSET_EMPTY(&mask)) 265 panic("vm_phys_domain_match: Impossible constraint"); 266 return (DOMAINSET_FFS(&mask) - 1); 267 #else 268 return (0); 269 #endif 270 } 271 272 /* 273 * Outputs the state of the physical memory allocator, specifically, 274 * the amount of physical memory in each free list. 275 */ 276 static int 277 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) 278 { 279 struct sbuf sbuf; 280 struct vm_freelist *fl; 281 int dom, error, flind, oind, pind; 282 283 error = sysctl_wire_old_buffer(req, 0); 284 if (error != 0) 285 return (error); 286 sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); 287 for (dom = 0; dom < vm_ndomains; dom++) { 288 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); 289 for (flind = 0; flind < vm_nfreelists; flind++) { 290 sbuf_printf(&sbuf, "\nFREE LIST %d:\n" 291 "\n ORDER (SIZE) | NUMBER" 292 "\n ", flind); 293 for (pind = 0; pind < VM_NFREEPOOL; pind++) 294 sbuf_printf(&sbuf, " | POOL %d", pind); 295 sbuf_printf(&sbuf, "\n-- "); 296 for (pind = 0; pind < VM_NFREEPOOL; pind++) 297 sbuf_printf(&sbuf, "-- -- "); 298 sbuf_printf(&sbuf, "--\n"); 299 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 300 sbuf_printf(&sbuf, " %2d (%6dK)", oind, 301 1 << (PAGE_SHIFT - 10 + oind)); 302 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 303 fl = vm_phys_free_queues[dom][flind][pind]; 304 sbuf_printf(&sbuf, " | %6d", 305 fl[oind].lcnt); 306 } 307 sbuf_printf(&sbuf, "\n"); 308 } 309 } 310 } 311 error = sbuf_finish(&sbuf); 312 sbuf_delete(&sbuf); 313 return (error); 314 } 315 316 /* 317 * Outputs the set of physical memory segments. 318 */ 319 static int 320 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) 321 { 322 struct sbuf sbuf; 323 struct vm_phys_seg *seg; 324 int error, segind; 325 326 error = sysctl_wire_old_buffer(req, 0); 327 if (error != 0) 328 return (error); 329 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 330 for (segind = 0; segind < vm_phys_nsegs; segind++) { 331 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); 332 seg = &vm_phys_segs[segind]; 333 sbuf_printf(&sbuf, "start: %#jx\n", 334 (uintmax_t)seg->start); 335 sbuf_printf(&sbuf, "end: %#jx\n", 336 (uintmax_t)seg->end); 337 sbuf_printf(&sbuf, "domain: %d\n", seg->domain); 338 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); 339 } 340 error = sbuf_finish(&sbuf); 341 sbuf_delete(&sbuf); 342 return (error); 343 } 344 345 /* 346 * Return affinity, or -1 if there's no affinity information. 347 */ 348 int 349 vm_phys_mem_affinity(int f __numa_used, int t __numa_used) 350 { 351 352 #ifdef NUMA 353 if (mem_locality == NULL) 354 return (-1); 355 if (f >= vm_ndomains || t >= vm_ndomains) 356 return (-1); 357 return (mem_locality[f * vm_ndomains + t]); 358 #else 359 return (-1); 360 #endif 361 } 362 363 #ifdef NUMA 364 /* 365 * Outputs the VM locality table. 
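 * For example, on a hypothetical two-domain system whose SLIT-style
 * distances are 10 (local) and 21 (remote), "sysctl vm.phys_locality"
 * would produce output resembling:
 *
 *	0: 10 21
 *	1: 21 10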
366 */ 367 static int 368 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) 369 { 370 struct sbuf sbuf; 371 int error, i, j; 372 373 error = sysctl_wire_old_buffer(req, 0); 374 if (error != 0) 375 return (error); 376 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 377 378 sbuf_printf(&sbuf, "\n"); 379 380 for (i = 0; i < vm_ndomains; i++) { 381 sbuf_printf(&sbuf, "%d: ", i); 382 for (j = 0; j < vm_ndomains; j++) { 383 sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); 384 } 385 sbuf_printf(&sbuf, "\n"); 386 } 387 error = sbuf_finish(&sbuf); 388 sbuf_delete(&sbuf); 389 return (error); 390 } 391 #endif 392 393 static void 394 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int pool, 395 int tail) 396 { 397 398 m->order = order; 399 m->pool = pool; 400 if (tail) 401 TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); 402 else 403 TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); 404 fl[order].lcnt++; 405 } 406 407 static void 408 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) 409 { 410 411 TAILQ_REMOVE(&fl[order].pl, m, listq); 412 fl[order].lcnt--; 413 m->order = VM_NFREEORDER; 414 } 415 416 /* 417 * Create a physical memory segment. 418 */ 419 static void 420 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) 421 { 422 struct vm_phys_seg *seg; 423 424 if (!(0 <= domain && domain < vm_ndomains)) 425 panic("%s: Invalid domain %d ('vm_ndomains' is %d)", 426 __func__, domain, vm_ndomains); 427 if (vm_phys_nsegs >= VM_PHYSSEG_MAX) 428 panic("Not enough storage for physical segments, " 429 "increase VM_PHYSSEG_MAX"); 430 431 seg = &vm_phys_segs[vm_phys_nsegs++]; 432 while (seg > vm_phys_segs && seg[-1].start >= end) { 433 *seg = *(seg - 1); 434 seg--; 435 } 436 seg->start = start; 437 seg->end = end; 438 seg->domain = domain; 439 if (seg != vm_phys_segs && seg[-1].end > start) 440 panic("Overlapping physical segments: Current [%#jx,%#jx) " 441 "at index %zu, previous [%#jx,%#jx)", 442 (uintmax_t)start, (uintmax_t)end, seg - vm_phys_segs, 443 (uintmax_t)seg[-1].start, (uintmax_t)seg[-1].end); 444 } 445 446 static void 447 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) 448 { 449 #ifdef NUMA 450 int i; 451 452 if (mem_affinity == NULL) { 453 _vm_phys_create_seg(start, end, 0); 454 return; 455 } 456 457 for (i = 0;; i++) { 458 if (mem_affinity[i].end == 0) 459 panic("Reached end of affinity info"); 460 if (mem_affinity[i].end <= start) 461 continue; 462 if (mem_affinity[i].start > start) 463 panic("No affinity info for start %jx", 464 (uintmax_t)start); 465 if (mem_affinity[i].end >= end) { 466 _vm_phys_create_seg(start, end, 467 mem_affinity[i].domain); 468 break; 469 } 470 _vm_phys_create_seg(start, mem_affinity[i].end, 471 mem_affinity[i].domain); 472 start = mem_affinity[i].end; 473 } 474 #else 475 _vm_phys_create_seg(start, end, 0); 476 #endif 477 } 478 479 /* 480 * Add a physical memory segment. 481 */ 482 void 483 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) 484 { 485 vm_paddr_t paddr; 486 487 if ((start & PAGE_MASK) != 0) 488 panic("%s: start (%jx) is not page aligned", __func__, 489 (uintmax_t)start); 490 if ((end & PAGE_MASK) != 0) 491 panic("%s: end (%jx) is not page aligned", __func__, 492 (uintmax_t)end); 493 if (start > end) 494 panic("%s: start (%jx) > end (%jx)!", __func__, 495 (uintmax_t)start, (uintmax_t)end); 496 497 if (start == end) 498 return; 499 500 /* 501 * Split the physical memory segment if it spans two or more free 502 * list boundaries. 
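	 * For example, with a hypothetical 16 MB VM_LOWMEM_BOUNDARY and the
	 * 4 GB VM_DMA32_BOUNDARY, a single segment covering [0, 8 GB) is
	 * recorded as three segments: [0, 16 MB), [16 MB, 4 GB), and
	 * [4 GB, 8 GB).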
503 */ 504 paddr = start; 505 #ifdef VM_FREELIST_LOWMEM 506 if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { 507 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); 508 paddr = VM_LOWMEM_BOUNDARY; 509 } 510 #endif 511 #ifdef VM_FREELIST_DMA32 512 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { 513 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); 514 paddr = VM_DMA32_BOUNDARY; 515 } 516 #endif 517 vm_phys_create_seg(paddr, end); 518 } 519 520 /* 521 * Initialize the physical memory allocator. 522 * 523 * Requires that vm_page_array is initialized! 524 */ 525 void 526 vm_phys_init(void) 527 { 528 struct vm_freelist *fl; 529 struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg; 530 #if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE) 531 u_long npages; 532 #endif 533 int dom, flind, freelist, oind, pind, segind; 534 535 /* 536 * Compute the number of free lists, and generate the mapping from the 537 * manifest constants VM_FREELIST_* to the free list indices. 538 * 539 * Initially, the entries of vm_freelist_to_flind[] are set to either 540 * 0 or 1 to indicate which free lists should be created. 541 */ 542 #ifdef VM_DMA32_NPAGES_THRESHOLD 543 npages = 0; 544 #endif 545 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 546 seg = &vm_phys_segs[segind]; 547 #ifdef VM_FREELIST_LOWMEM 548 if (seg->end <= VM_LOWMEM_BOUNDARY) 549 vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; 550 else 551 #endif 552 #ifdef VM_FREELIST_DMA32 553 if ( 554 #ifdef VM_DMA32_NPAGES_THRESHOLD 555 /* 556 * Create the DMA32 free list only if the amount of 557 * physical memory above physical address 4G exceeds the 558 * given threshold. 559 */ 560 npages > VM_DMA32_NPAGES_THRESHOLD && 561 #endif 562 seg->end <= VM_DMA32_BOUNDARY) 563 vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; 564 else 565 #endif 566 { 567 #ifdef VM_DMA32_NPAGES_THRESHOLD 568 npages += atop(seg->end - seg->start); 569 #endif 570 vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; 571 } 572 } 573 /* Change each entry into a running total of the free lists. */ 574 for (freelist = 1; freelist < VM_NFREELIST; freelist++) { 575 vm_freelist_to_flind[freelist] += 576 vm_freelist_to_flind[freelist - 1]; 577 } 578 vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; 579 KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); 580 /* Change each entry into a free list index. */ 581 for (freelist = 0; freelist < VM_NFREELIST; freelist++) 582 vm_freelist_to_flind[freelist]--; 583 584 /* 585 * Initialize the first_page and free_queues fields of each physical 586 * memory segment. 
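	 * Under VM_PHYSSEG_SPARSE, page structures are packed, so each
	 * segment's first_page is found from the running page count of the
	 * preceding segments; for example, if the first segment spans four
	 * pages, the second segment's first_page is &vm_page_array[4].
	 * Under VM_PHYSSEG_DENSE, PHYS_TO_VM_PAGE() maps the segment's
	 * start address to its page structure directly.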
587 */ 588 #ifdef VM_PHYSSEG_SPARSE 589 npages = 0; 590 #endif 591 for (segind = 0; segind < vm_phys_nsegs; segind++) { 592 seg = &vm_phys_segs[segind]; 593 #ifdef VM_PHYSSEG_SPARSE 594 seg->first_page = &vm_page_array[npages]; 595 npages += atop(seg->end - seg->start); 596 #else 597 seg->first_page = PHYS_TO_VM_PAGE(seg->start); 598 #endif 599 #ifdef VM_FREELIST_LOWMEM 600 if (seg->end <= VM_LOWMEM_BOUNDARY) { 601 flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; 602 KASSERT(flind >= 0, 603 ("vm_phys_init: LOWMEM flind < 0")); 604 } else 605 #endif 606 #ifdef VM_FREELIST_DMA32 607 if (seg->end <= VM_DMA32_BOUNDARY) { 608 flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; 609 KASSERT(flind >= 0, 610 ("vm_phys_init: DMA32 flind < 0")); 611 } else 612 #endif 613 { 614 flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; 615 KASSERT(flind >= 0, 616 ("vm_phys_init: DEFAULT flind < 0")); 617 } 618 seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; 619 } 620 621 /* 622 * Coalesce physical memory segments that are contiguous and share the 623 * same per-domain free queues. 624 */ 625 prev_seg = vm_phys_segs; 626 seg = &vm_phys_segs[1]; 627 end_seg = &vm_phys_segs[vm_phys_nsegs]; 628 while (seg < end_seg) { 629 if (prev_seg->end == seg->start && 630 prev_seg->free_queues == seg->free_queues) { 631 prev_seg->end = seg->end; 632 KASSERT(prev_seg->domain == seg->domain, 633 ("vm_phys_init: free queues cannot span domains")); 634 vm_phys_nsegs--; 635 end_seg--; 636 for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++) 637 *tmp_seg = *(tmp_seg + 1); 638 } else { 639 prev_seg = seg; 640 seg++; 641 } 642 } 643 644 /* 645 * Initialize the free queues. 646 */ 647 for (dom = 0; dom < vm_ndomains; dom++) { 648 for (flind = 0; flind < vm_nfreelists; flind++) { 649 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 650 fl = vm_phys_free_queues[dom][flind][pind]; 651 for (oind = 0; oind < VM_NFREEORDER; oind++) 652 TAILQ_INIT(&fl[oind].pl); 653 } 654 } 655 } 656 657 #ifdef VM_FREEPOOL_LAZYINIT 658 vm_default_freepool = VM_FREEPOOL_LAZYINIT; 659 #else 660 vm_default_freepool = VM_FREEPOOL_DEFAULT; 661 #endif 662 663 rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); 664 } 665 666 /* 667 * Register info about the NUMA topology of the system. 668 * 669 * Invoked by platform-dependent code prior to vm_phys_init(). 670 */ 671 void 672 vm_phys_register_domains(int ndomains __numa_used, 673 struct mem_affinity *affinity __numa_used, int *locality __numa_used) 674 { 675 #ifdef NUMA 676 int i; 677 678 /* 679 * For now the only override value that we support is 1, which 680 * effectively disables NUMA-awareness in the allocators. 681 */ 682 TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled); 683 if (numa_disabled) 684 ndomains = 1; 685 686 if (ndomains > 1) { 687 vm_ndomains = ndomains; 688 mem_affinity = affinity; 689 mem_locality = locality; 690 } 691 692 for (i = 0; i < vm_ndomains; i++) 693 DOMAINSET_SET(i, &all_domains); 694 #endif 695 } 696 697 /* 698 * Split a contiguous, power of two-sized set of physical pages. 699 * 700 * When this function is called by a page allocation function, the caller 701 * should request insertion at the head unless the order [order, oind) queues 702 * are known to be empty. The objective being to reduce the likelihood of 703 * long-term fragmentation by promoting contemporaneous allocation and 704 * (hopefully) deallocation. 
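 * For example, splitting an order-3 (eight-page) block to satisfy an
 * order-0 request returns the upper order-2, order-1, and order-0 buddies
 * to the free lists, one per iteration, leaving only the first page for
 * the caller.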
705 */ 706 static __inline void 707 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, 708 int pool, int tail) 709 { 710 vm_page_t m_buddy; 711 712 while (oind > order) { 713 oind--; 714 m_buddy = &m[1 << oind]; 715 KASSERT(m_buddy->order == VM_NFREEORDER, 716 ("vm_phys_split_pages: page %p has unexpected order %d", 717 m_buddy, m_buddy->order)); 718 vm_freelist_add(fl, m_buddy, oind, pool, tail); 719 } 720 } 721 722 static void 723 vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int pool, 724 int tail) 725 { 726 KASSERT(order >= 0 && order < VM_NFREEORDER, 727 ("%s: invalid order %d", __func__, order)); 728 729 vm_freelist_add(fl, m, order, pool, tail); 730 #ifdef VM_FREEPOOL_LAZYINIT 731 if (__predict_false(pool == VM_FREEPOOL_LAZYINIT)) { 732 vm_page_t m_next; 733 vm_paddr_t pa; 734 int npages; 735 736 npages = 1 << order; 737 m_next = m + npages; 738 pa = m->phys_addr + ptoa(npages); 739 if (pa < vm_phys_segs[m->segind].end) { 740 vm_page_init_page(m_next, pa, m->segind, 741 VM_FREEPOOL_LAZYINIT); 742 } 743 } 744 #endif 745 } 746 747 /* 748 * Add the physical pages [m, m + npages) at the beginning of a power-of-two 749 * aligned and sized set to the specified free list. 750 * 751 * When this function is called by a page allocation function, the caller 752 * should request insertion at the head unless the lower-order queues are 753 * known to be empty. The objective being to reduce the likelihood of long- 754 * term fragmentation by promoting contemporaneous allocation and (hopefully) 755 * deallocation. 756 * 757 * The physical page m's buddy must not be free. 758 */ 759 static void 760 vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool, 761 int tail) 762 { 763 int order; 764 765 KASSERT(npages == 0 || 766 (VM_PAGE_TO_PHYS(m) & 767 ((PAGE_SIZE << ilog2(npages)) - 1)) == 0, 768 ("%s: page %p and npages %u are misaligned", 769 __func__, m, npages)); 770 while (npages > 0) { 771 KASSERT(m->order == VM_NFREEORDER, 772 ("%s: page %p has unexpected order %d", 773 __func__, m, m->order)); 774 order = ilog2(npages); 775 KASSERT(order < VM_NFREEORDER, 776 ("%s: order %d is out of range", __func__, order)); 777 vm_phys_enq_chunk(fl, m, order, pool, tail); 778 m += 1 << order; 779 npages -= 1 << order; 780 } 781 } 782 783 /* 784 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned 785 * and sized set to the specified free list. 786 * 787 * When this function is called by a page allocation function, the caller 788 * should request insertion at the head unless the lower-order queues are 789 * known to be empty. The objective being to reduce the likelihood of long- 790 * term fragmentation by promoting contemporaneous allocation and (hopefully) 791 * deallocation. 792 * 793 * If npages is zero, this function does nothing and ignores the physical page 794 * parameter m. Otherwise, the physical page m's buddy must not be free. 
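 * For example, with npages equal to 11 (binary 1011), blocks of 1, 2, and 8
 * pages are queued in that order, working from the low end of the range
 * toward its aligned upper end; the end-of-set alignment required of the
 * caller guarantees that each block is naturally aligned.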
795 */ 796 static vm_page_t 797 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool, 798 int tail) 799 { 800 int order; 801 802 KASSERT(npages == 0 || 803 ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) & 804 ((PAGE_SIZE << ilog2(npages)) - 1)) == 0, 805 ("vm_phys_enq_range: page %p and npages %u are misaligned", 806 m, npages)); 807 while (npages > 0) { 808 KASSERT(m->order == VM_NFREEORDER, 809 ("vm_phys_enq_range: page %p has unexpected order %d", 810 m, m->order)); 811 order = ffs(npages) - 1; 812 vm_phys_enq_chunk(fl, m, order, pool, tail); 813 m += 1 << order; 814 npages -= 1 << order; 815 } 816 return (m); 817 } 818 819 /* 820 * Complete initialization a contiguous, power of two-sized set of physical 821 * pages. 822 * 823 * If the pages currently belong to the lazy init pool, then the corresponding 824 * page structures must be initialized. In this case it is assumed that the 825 * first page in the run has already been initialized. 826 */ 827 static void 828 vm_phys_finish_init(vm_page_t m, int order) 829 { 830 #ifdef VM_FREEPOOL_LAZYINIT 831 if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) { 832 vm_paddr_t pa; 833 int segind; 834 835 TSENTER(); 836 pa = m->phys_addr + PAGE_SIZE; 837 segind = m->segind; 838 for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order]; 839 m_tmp++, pa += PAGE_SIZE) 840 vm_page_init_page(m_tmp, pa, segind, VM_NFREEPOOL); 841 TSEXIT(); 842 } 843 #endif 844 } 845 846 /* 847 * Tries to allocate the specified number of pages from the specified pool 848 * within the specified domain. Returns the actual number of allocated pages 849 * and a pointer to each page through the array ma[]. 850 * 851 * The returned pages may not be physically contiguous. However, in contrast 852 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0), 853 * calling this function once to allocate the desired number of pages will 854 * avoid wasted time in vm_phys_split_pages(). The allocated pages have no 855 * valid pool field set. 856 * 857 * The free page queues for the specified domain must be locked. 858 */ 859 int 860 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) 861 { 862 struct vm_freelist *alt, *fl; 863 vm_page_t m; 864 int avail, end, flind, freelist, i, oind, pind; 865 866 KASSERT(domain >= 0 && domain < vm_ndomains, 867 ("vm_phys_alloc_npages: domain %d is out of range", domain)); 868 KASSERT(vm_phys_pool_valid(pool), 869 ("vm_phys_alloc_npages: pool %d is out of range", pool)); 870 KASSERT(npages <= 1 << (VM_NFREEORDER - 1), 871 ("vm_phys_alloc_npages: npages %d is out of range", npages)); 872 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 873 i = 0; 874 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 875 flind = vm_freelist_to_flind[freelist]; 876 if (flind < 0) 877 continue; 878 fl = vm_phys_free_queues[domain][flind][pool]; 879 for (oind = 0; oind < VM_NFREEORDER; oind++) { 880 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 881 vm_freelist_rem(fl, m, oind); 882 avail = i + (1 << oind); 883 end = imin(npages, avail); 884 while (i < end) 885 ma[i++] = m++; 886 if (i == npages) { 887 /* 888 * Return excess pages to fl. Its order 889 * [0, oind) queues are empty. 
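				 * For example, if three pages were
				 * requested and an order-2 (four-page)
				 * block was dequeued, the one excess
				 * page goes back at order 0.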
890 */ 891 vm_phys_enq_range(m, avail - i, fl, 892 pool, 1); 893 return (npages); 894 } 895 } 896 } 897 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 898 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; 899 pind++) { 900 alt = vm_phys_free_queues[domain][flind][pind]; 901 while ((m = TAILQ_FIRST(&alt[oind].pl)) != 902 NULL) { 903 vm_freelist_rem(alt, m, oind); 904 vm_phys_finish_init(m, oind); 905 avail = i + (1 << oind); 906 end = imin(npages, avail); 907 while (i < end) 908 ma[i++] = m++; 909 if (i == npages) { 910 /* 911 * Return excess pages to fl. 912 * Its order [0, oind) queues 913 * are empty. 914 */ 915 vm_phys_enq_range(m, avail - i, 916 fl, pool, 1); 917 return (npages); 918 } 919 } 920 } 921 } 922 } 923 return (i); 924 } 925 926 /* 927 * Allocate a contiguous, power of two-sized set of physical pages from the 928 * specified free list. The free list must be specified using one of the 929 * manifest constants VM_FREELIST_*. 930 * 931 * The free page queues must be locked. 932 */ 933 static vm_page_t 934 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) 935 { 936 struct vm_freelist *alt, *fl; 937 vm_page_t m; 938 int oind, pind, flind; 939 940 KASSERT(domain >= 0 && domain < vm_ndomains, 941 ("vm_phys_alloc_freelist_pages: domain %d is out of range", 942 domain)); 943 KASSERT(freelist < VM_NFREELIST, 944 ("vm_phys_alloc_freelist_pages: freelist %d is out of range", 945 freelist)); 946 KASSERT(vm_phys_pool_valid(pool), 947 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); 948 KASSERT(order < VM_NFREEORDER, 949 ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); 950 951 flind = vm_freelist_to_flind[freelist]; 952 /* Check if freelist is present */ 953 if (flind < 0) 954 return (NULL); 955 956 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 957 fl = &vm_phys_free_queues[domain][flind][pool][0]; 958 for (oind = order; oind < VM_NFREEORDER; oind++) { 959 m = TAILQ_FIRST(&fl[oind].pl); 960 if (m != NULL) { 961 vm_freelist_rem(fl, m, oind); 962 /* The order [order, oind) queues are empty. */ 963 vm_phys_split_pages(m, oind, fl, order, pool, 1); 964 return (m); 965 } 966 } 967 968 /* 969 * The given pool was empty. Find the largest 970 * contiguous, power-of-two-sized set of pages in any 971 * pool. Transfer these pages to the given pool, and 972 * use them to satisfy the allocation. 973 */ 974 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { 975 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) { 976 alt = &vm_phys_free_queues[domain][flind][pind][0]; 977 m = TAILQ_FIRST(&alt[oind].pl); 978 if (m != NULL) { 979 vm_freelist_rem(alt, m, oind); 980 vm_phys_finish_init(m, oind); 981 /* The order [order, oind) queues are empty. */ 982 vm_phys_split_pages(m, oind, fl, order, pool, 1); 983 return (m); 984 } 985 } 986 } 987 return (NULL); 988 } 989 990 /* 991 * Allocate a contiguous, power of two-sized set of physical pages 992 * from the free lists. 993 * 994 * The free page queues must be locked. 995 */ 996 vm_page_t 997 vm_phys_alloc_pages(int domain, int pool, int order) 998 { 999 vm_page_t m; 1000 int freelist; 1001 1002 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 1003 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); 1004 if (m != NULL) 1005 return (m); 1006 } 1007 return (NULL); 1008 } 1009 1010 /* 1011 * Find the vm_page corresponding to the given physical address, which must lie 1012 * within the given physical memory segment. 
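 * The lookup is a constant-time index into the segment's page array; for
 * example, a physical address three pages past seg->start resolves to
 * &seg->first_page[3].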
1013 */ 1014 vm_page_t 1015 vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa) 1016 { 1017 KASSERT(pa >= seg->start && pa < seg->end, 1018 ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa)); 1019 1020 return (&seg->first_page[atop(pa - seg->start)]); 1021 } 1022 1023 /* 1024 * Find the vm_page corresponding to the given physical address. 1025 */ 1026 vm_page_t 1027 vm_phys_paddr_to_vm_page(vm_paddr_t pa) 1028 { 1029 struct vm_phys_seg *seg; 1030 1031 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) 1032 return (vm_phys_seg_paddr_to_vm_page(seg, pa)); 1033 return (NULL); 1034 } 1035 1036 vm_page_t 1037 vm_phys_fictitious_to_vm_page(vm_paddr_t pa) 1038 { 1039 struct vm_phys_fictitious_seg tmp, *seg; 1040 vm_page_t m; 1041 1042 m = NULL; 1043 tmp.start = pa; 1044 tmp.end = 0; 1045 1046 rw_rlock(&vm_phys_fictitious_reg_lock); 1047 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1048 rw_runlock(&vm_phys_fictitious_reg_lock); 1049 if (seg == NULL) 1050 return (NULL); 1051 1052 m = &seg->first_page[atop(pa - seg->start)]; 1053 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); 1054 1055 return (m); 1056 } 1057 1058 static inline void 1059 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, 1060 long page_count, vm_memattr_t memattr) 1061 { 1062 long i; 1063 1064 bzero(range, page_count * sizeof(*range)); 1065 for (i = 0; i < page_count; i++) { 1066 vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); 1067 range[i].oflags &= ~VPO_UNMANAGED; 1068 range[i].busy_lock = VPB_UNBUSIED; 1069 } 1070 } 1071 1072 int 1073 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, 1074 vm_memattr_t memattr) 1075 { 1076 struct vm_phys_fictitious_seg *seg; 1077 vm_page_t fp; 1078 long page_count; 1079 #ifdef VM_PHYSSEG_DENSE 1080 long pi, pe; 1081 long dpage_count; 1082 #endif 1083 1084 KASSERT(start < end, 1085 ("Start of segment isn't less than end (start: %jx end: %jx)", 1086 (uintmax_t)start, (uintmax_t)end)); 1087 1088 page_count = (end - start) / PAGE_SIZE; 1089 1090 #ifdef VM_PHYSSEG_DENSE 1091 pi = atop(start); 1092 pe = atop(end); 1093 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1094 fp = &vm_page_array[pi - first_page]; 1095 if ((pe - first_page) > vm_page_array_size) { 1096 /* 1097 * We have a segment that starts inside 1098 * of vm_page_array, but ends outside of it. 1099 * 1100 * Use vm_page_array pages for those that are 1101 * inside of the vm_page_array range, and 1102 * allocate the remaining ones. 1103 */ 1104 dpage_count = vm_page_array_size - (pi - first_page); 1105 vm_phys_fictitious_init_range(fp, start, dpage_count, 1106 memattr); 1107 page_count -= dpage_count; 1108 start += ptoa(dpage_count); 1109 goto alloc; 1110 } 1111 /* 1112 * We can allocate the full range from vm_page_array, 1113 * so there's no need to register the range in the tree. 1114 */ 1115 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1116 return (0); 1117 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1118 /* 1119 * We have a segment that ends inside of vm_page_array, 1120 * but starts outside of it. 
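		 * That is, with addresses increasing to the right:
		 *
		 *	fictitious range:  [ start ........ end )
		 *	vm_page_array:           [ array start ........ array end )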
1121 */ 1122 fp = &vm_page_array[0]; 1123 dpage_count = pe - first_page; 1124 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, 1125 memattr); 1126 end -= ptoa(dpage_count); 1127 page_count -= dpage_count; 1128 goto alloc; 1129 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1130 /* 1131 * Trying to register a fictitious range that expands before 1132 * and after vm_page_array. 1133 */ 1134 return (EINVAL); 1135 } else { 1136 alloc: 1137 #endif 1138 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, 1139 M_WAITOK); 1140 #ifdef VM_PHYSSEG_DENSE 1141 } 1142 #endif 1143 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1144 1145 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); 1146 seg->start = start; 1147 seg->end = end; 1148 seg->first_page = fp; 1149 1150 rw_wlock(&vm_phys_fictitious_reg_lock); 1151 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); 1152 rw_wunlock(&vm_phys_fictitious_reg_lock); 1153 1154 return (0); 1155 } 1156 1157 void 1158 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) 1159 { 1160 struct vm_phys_fictitious_seg *seg, tmp; 1161 #ifdef VM_PHYSSEG_DENSE 1162 long pi, pe; 1163 #endif 1164 1165 KASSERT(start < end, 1166 ("Start of segment isn't less than end (start: %jx end: %jx)", 1167 (uintmax_t)start, (uintmax_t)end)); 1168 1169 #ifdef VM_PHYSSEG_DENSE 1170 pi = atop(start); 1171 pe = atop(end); 1172 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1173 if ((pe - first_page) <= vm_page_array_size) { 1174 /* 1175 * This segment was allocated using vm_page_array 1176 * only, there's nothing to do since those pages 1177 * were never added to the tree. 1178 */ 1179 return; 1180 } 1181 /* 1182 * We have a segment that starts inside 1183 * of vm_page_array, but ends outside of it. 1184 * 1185 * Calculate how many pages were added to the 1186 * tree and free them. 1187 */ 1188 start = ptoa(first_page + vm_page_array_size); 1189 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1190 /* 1191 * We have a segment that ends inside of vm_page_array, 1192 * but starts outside of it. 1193 */ 1194 end = ptoa(first_page); 1195 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1196 /* Since it's not possible to register such a range, panic. */ 1197 panic( 1198 "Unregistering not registered fictitious range [%#jx:%#jx]", 1199 (uintmax_t)start, (uintmax_t)end); 1200 } 1201 #endif 1202 tmp.start = start; 1203 tmp.end = 0; 1204 1205 rw_wlock(&vm_phys_fictitious_reg_lock); 1206 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1207 if (seg->start != start || seg->end != end) { 1208 rw_wunlock(&vm_phys_fictitious_reg_lock); 1209 panic( 1210 "Unregistering not registered fictitious range [%#jx:%#jx]", 1211 (uintmax_t)start, (uintmax_t)end); 1212 } 1213 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); 1214 rw_wunlock(&vm_phys_fictitious_reg_lock); 1215 free(seg->first_page, M_FICT_PAGES); 1216 free(seg, M_FICT_PAGES); 1217 } 1218 1219 /* 1220 * Free a contiguous, power of two-sized set of physical pages. 1221 * The pool field in the first page determines the destination pool. 1222 * 1223 * The free page queues must be locked. 
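 * For example, freeing an order-0 page whose order-0, order-1, and order-2
 * buddies are already free (and lie within the same segment) places a
 * single order-3 block on the free list of the requested pool.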
1224 */ 1225 void 1226 vm_phys_free_pages(vm_page_t m, int pool, int order) 1227 { 1228 struct vm_freelist *fl; 1229 struct vm_phys_seg *seg; 1230 vm_paddr_t pa; 1231 vm_page_t m_buddy; 1232 1233 KASSERT(m->order == VM_NFREEORDER, 1234 ("%s: page %p has unexpected order %d", 1235 __func__, m, m->order)); 1236 KASSERT(vm_phys_pool_valid(pool), 1237 ("%s: unexpected pool param %d", __func__, pool)); 1238 KASSERT(order < VM_NFREEORDER, 1239 ("%s: order %d is out of range", __func__, order)); 1240 seg = &vm_phys_segs[m->segind]; 1241 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1242 if (order < VM_NFREEORDER - 1) { 1243 pa = VM_PAGE_TO_PHYS(m); 1244 do { 1245 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); 1246 if (pa < seg->start || pa >= seg->end) 1247 break; 1248 m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa); 1249 if (m_buddy->order != order) 1250 break; 1251 fl = (*seg->free_queues)[m_buddy->pool]; 1252 vm_freelist_rem(fl, m_buddy, order); 1253 vm_phys_finish_init(m_buddy, order); 1254 order++; 1255 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); 1256 m = vm_phys_seg_paddr_to_vm_page(seg, pa); 1257 } while (order < VM_NFREEORDER - 1); 1258 } 1259 fl = (*seg->free_queues)[pool]; 1260 vm_freelist_add(fl, m, order, pool, 1); 1261 } 1262 1263 #ifdef VM_FREEPOOL_LAZYINIT 1264 /* 1265 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving 1266 * them to the default pool. This is a prerequisite for some rare operations 1267 * which need to scan the page array and thus depend on all pages being 1268 * initialized. 1269 */ 1270 static void 1271 vm_phys_lazy_init_domain(int domain, bool locked) 1272 { 1273 static bool initdone[MAXMEMDOM]; 1274 struct vm_domain *vmd; 1275 struct vm_freelist *fl; 1276 vm_page_t m; 1277 int pind; 1278 bool unlocked; 1279 1280 if (__predict_true(atomic_load_bool(&initdone[domain]))) 1281 return; 1282 1283 vmd = VM_DOMAIN(domain); 1284 if (locked) 1285 vm_domain_free_assert_locked(vmd); 1286 else 1287 vm_domain_free_lock(vmd); 1288 if (atomic_load_bool(&initdone[domain])) 1289 goto out; 1290 pind = VM_FREEPOOL_LAZYINIT; 1291 for (int freelist = 0; freelist < VM_NFREELIST; freelist++) { 1292 int flind; 1293 1294 flind = vm_freelist_to_flind[freelist]; 1295 if (flind < 0) 1296 continue; 1297 fl = vm_phys_free_queues[domain][flind][pind]; 1298 for (int oind = 0; oind < VM_NFREEORDER; oind++) { 1299 if (atomic_load_int(&fl[oind].lcnt) == 0) 1300 continue; 1301 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 1302 /* 1303 * Avoid holding the lock across the 1304 * initialization unless there's a free page 1305 * shortage. 
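				 * vm_domain_allocate() reserves the
				 * pages so that the domain free lock
				 * can be dropped while their page
				 * structures are initialized; the
				 * count is given back just before the
				 * pages are freed again below.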
1306 */ 1307 vm_freelist_rem(fl, m, oind); 1308 unlocked = vm_domain_allocate(vmd, 1309 VM_ALLOC_NORMAL, 1 << oind); 1310 if (unlocked) 1311 vm_domain_free_unlock(vmd); 1312 vm_phys_finish_init(m, oind); 1313 if (unlocked) { 1314 vm_domain_freecnt_inc(vmd, 1 << oind); 1315 vm_domain_free_lock(vmd); 1316 } 1317 vm_phys_free_pages(m, VM_FREEPOOL_DEFAULT, 1318 oind); 1319 } 1320 } 1321 } 1322 atomic_store_bool(&initdone[domain], true); 1323 out: 1324 if (!locked) 1325 vm_domain_free_unlock(vmd); 1326 } 1327 1328 static void 1329 vm_phys_lazy_init(void) 1330 { 1331 for (int domain = 0; domain < vm_ndomains; domain++) 1332 vm_phys_lazy_init_domain(domain, false); 1333 atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT); 1334 } 1335 1336 static void 1337 vm_phys_lazy_init_kthr(void *arg __unused) 1338 { 1339 vm_phys_lazy_init(); 1340 kthread_exit(); 1341 } 1342 1343 static void 1344 vm_phys_lazy_sysinit(void *arg __unused) 1345 { 1346 struct thread *td; 1347 int error; 1348 1349 error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td, 1350 RFSTOPPED, 0, "vmlazyinit"); 1351 if (error == 0) { 1352 thread_lock(td); 1353 sched_prio(td, PRI_MIN_IDLE); 1354 sched_add(td, SRQ_BORING); 1355 } else { 1356 printf("%s: could not create lazy init thread: %d\n", 1357 __func__, error); 1358 vm_phys_lazy_init(); 1359 } 1360 } 1361 SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit, 1362 NULL); 1363 #endif /* VM_FREEPOOL_LAZYINIT */ 1364 1365 /* 1366 * Free a contiguous, arbitrarily sized set of physical pages, without 1367 * merging across set boundaries. Assumes no pages have a valid pool field. 1368 * 1369 * The free page queues must be locked. 1370 */ 1371 void 1372 vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages) 1373 { 1374 struct vm_freelist *fl; 1375 struct vm_phys_seg *seg; 1376 vm_page_t m_end; 1377 vm_paddr_t diff, lo; 1378 int order; 1379 1380 /* 1381 * Avoid unnecessary coalescing by freeing the pages in the largest 1382 * possible power-of-two-sized subsets. 1383 */ 1384 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1385 seg = &vm_phys_segs[m->segind]; 1386 fl = (*seg->free_queues)[pool]; 1387 m_end = m + npages; 1388 /* Free blocks of increasing size. */ 1389 lo = atop(VM_PAGE_TO_PHYS(m)); 1390 if (m < m_end && 1391 (diff = lo ^ (lo + npages - 1)) != 0) { 1392 order = min(ilog2(diff), VM_NFREEORDER - 1); 1393 m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1394 pool, 1); 1395 } 1396 1397 /* Free blocks of maximum size. */ 1398 order = VM_NFREEORDER - 1; 1399 while (m + (1 << order) <= m_end) { 1400 KASSERT(seg == &vm_phys_segs[m->segind], 1401 ("%s: page range [%p,%p) spans multiple segments", 1402 __func__, m_end - npages, m)); 1403 vm_phys_enq_chunk(fl, m, order, pool, 1); 1404 m += 1 << order; 1405 } 1406 /* Free blocks of diminishing size. */ 1407 vm_phys_enq_beg(m, m_end - m, fl, pool, 1); 1408 } 1409 1410 /* 1411 * Free a contiguous, arbitrarily sized set of physical pages. 1412 * Assumes that every page but the first has no valid pool field. 1413 * Uses the pool value in the first page if valid, otherwise default. 1414 * 1415 * The free page queues must be locked. 
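 * For example, freeing 10 pages starting at page frame 3 queues the runs
 * {3}, {4-7}, {8-11}, and {12}: the interior runs are enqueued first,
 * without coalescing, and the two edge pages are freed last through
 * vm_phys_free_pages() so that they can coalesce with neighboring free
 * blocks outside the range.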
1416 */ 1417 void 1418 vm_phys_free_contig(vm_page_t m, int pool, u_long npages) 1419 { 1420 vm_paddr_t lo; 1421 vm_page_t m_start, m_end; 1422 unsigned max_order, order_start, order_end; 1423 1424 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1425 1426 lo = atop(VM_PAGE_TO_PHYS(m)); 1427 max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1); 1428 1429 m_start = m; 1430 order_start = ffsll(lo) - 1; 1431 if (order_start < max_order) 1432 m_start += 1 << order_start; 1433 m_end = m + npages; 1434 order_end = ffsll(lo + npages) - 1; 1435 if (order_end < max_order) 1436 m_end -= 1 << order_end; 1437 /* 1438 * Avoid unnecessary coalescing by freeing the pages at the start and 1439 * end of the range last. 1440 */ 1441 if (m_start < m_end) 1442 vm_phys_enqueue_contig(m_start, pool, m_end - m_start); 1443 if (order_start < max_order) 1444 vm_phys_free_pages(m, pool, order_start); 1445 if (order_end < max_order) 1446 vm_phys_free_pages(m_end, pool, order_end); 1447 } 1448 1449 /* 1450 * Identify the first address range within segment segind or greater 1451 * that matches the domain, lies within the low/high range, and has 1452 * enough pages. Return -1 if there is none. 1453 */ 1454 int 1455 vm_phys_find_range(vm_page_t bounds[], int segind, int domain, 1456 u_long npages, vm_paddr_t low, vm_paddr_t high) 1457 { 1458 vm_paddr_t pa_end, pa_start; 1459 struct vm_phys_seg *end_seg, *seg; 1460 1461 KASSERT(npages > 0, ("npages is zero")); 1462 KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range")); 1463 end_seg = &vm_phys_segs[vm_phys_nsegs]; 1464 for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) { 1465 if (seg->domain != domain) 1466 continue; 1467 if (seg->start >= high) 1468 return (-1); 1469 pa_start = MAX(low, seg->start); 1470 pa_end = MIN(high, seg->end); 1471 if (pa_end - pa_start < ptoa(npages)) 1472 continue; 1473 #ifdef VM_FREEPOOL_LAZYINIT 1474 /* 1475 * The pages on the free lists must be initialized. 1476 */ 1477 vm_phys_lazy_init_domain(domain, false); 1478 #endif 1479 bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start); 1480 bounds[1] = &seg->first_page[atop(pa_end - seg->start)]; 1481 return (seg - vm_phys_segs); 1482 } 1483 return (-1); 1484 } 1485 1486 /* 1487 * Search for the given physical page "m" in the free lists. If the search 1488 * succeeds, remove "m" from the free lists and return true. Otherwise, return 1489 * false, indicating that "m" is not in the free lists. 1490 * 1491 * The free page queues must be locked. 1492 */ 1493 bool 1494 vm_phys_unfree_page(vm_paddr_t pa) 1495 { 1496 struct vm_freelist *fl; 1497 struct vm_phys_seg *seg; 1498 vm_paddr_t pa_half; 1499 vm_page_t m, m_set, m_tmp; 1500 int order, pool; 1501 1502 seg = vm_phys_paddr_to_seg(pa); 1503 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1504 1505 #ifdef VM_FREEPOOL_LAZYINIT 1506 /* 1507 * The pages on the free lists must be initialized. 1508 */ 1509 vm_phys_lazy_init_domain(seg->domain, true); 1510 #endif 1511 1512 /* 1513 * First, find the contiguous, power of two-sized set of free 1514 * physical pages containing the given physical page "m" and 1515 * assign it to "m_set". 
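	 * For example, if "pa" falls in the third page of a free order-2
	 * block, the loop below stops with "m_set" pointing at the block's
	 * first page and "order" equal to 2; the block is then split until
	 * the page at "pa" stands alone.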
1516 */ 1517 m = vm_phys_paddr_to_vm_page(pa); 1518 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && 1519 order < VM_NFREEORDER - 1; ) { 1520 order++; 1521 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); 1522 if (pa >= seg->start) 1523 m_set = vm_phys_seg_paddr_to_vm_page(seg, pa); 1524 else 1525 return (false); 1526 } 1527 if (m_set->order < order) 1528 return (false); 1529 if (m_set->order == VM_NFREEORDER) 1530 return (false); 1531 KASSERT(m_set->order < VM_NFREEORDER, 1532 ("vm_phys_unfree_page: page %p has unexpected order %d", 1533 m_set, m_set->order)); 1534 1535 /* 1536 * Next, remove "m_set" from the free lists. Finally, extract 1537 * "m" from "m_set" using an iterative algorithm: While "m_set" 1538 * is larger than a page, shrink "m_set" by returning the half 1539 * of "m_set" that does not contain "m" to the free lists. 1540 */ 1541 pool = m_set->pool; 1542 fl = (*seg->free_queues)[pool]; 1543 order = m_set->order; 1544 vm_freelist_rem(fl, m_set, order); 1545 while (order > 0) { 1546 order--; 1547 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); 1548 if (m->phys_addr < pa_half) 1549 m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half); 1550 else { 1551 m_tmp = m_set; 1552 m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half); 1553 } 1554 vm_freelist_add(fl, m_tmp, order, pool, 0); 1555 } 1556 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); 1557 return (true); 1558 } 1559 1560 /* 1561 * Find a run of contiguous physical pages, meeting alignment requirements, from 1562 * a list of max-sized page blocks, where we need at least two consecutive 1563 * blocks to satisfy the (large) page request. 1564 */ 1565 static vm_page_t 1566 vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages, 1567 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1568 { 1569 struct vm_phys_seg *seg; 1570 vm_page_t m, m_iter, m_ret; 1571 vm_paddr_t max_size, size; 1572 int max_order; 1573 1574 max_order = VM_NFREEORDER - 1; 1575 size = npages << PAGE_SHIFT; 1576 max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order); 1577 KASSERT(size > max_size, ("size is too small")); 1578 1579 /* 1580 * In order to avoid examining any free max-sized page block more than 1581 * twice, identify the ones that are first in a physically-contiguous 1582 * sequence of such blocks, and only for those walk the sequence to 1583 * check if there are enough free blocks starting at a properly aligned 1584 * block. Thus, no block is checked for free-ness more than twice. 1585 */ 1586 TAILQ_FOREACH(m, &fl[max_order].pl, listq) { 1587 /* 1588 * Skip m unless it is first in a sequence of free max page 1589 * blocks >= low in its segment. 1590 */ 1591 seg = &vm_phys_segs[m->segind]; 1592 if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start)) 1593 continue; 1594 if (VM_PAGE_TO_PHYS(m) >= max_size && 1595 VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) && 1596 max_order == m[-1 << max_order].order) 1597 continue; 1598 1599 /* 1600 * Advance m_ret from m to the first of the sequence, if any, 1601 * that satisfies alignment conditions and might leave enough 1602 * space. 
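		 * The walk advances one max-order block (1 << max_order
		 * pages) at a time, so only candidates that begin a free
		 * max-order block are considered.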
1603 */ 1604 m_ret = m; 1605 while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret), 1606 size, alignment, boundary) && 1607 VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) && 1608 max_order == m_ret[1 << max_order].order) 1609 m_ret += 1 << max_order; 1610 1611 /* 1612 * Skip m unless some block m_ret in the sequence is properly 1613 * aligned, and begins a sequence of enough pages less than 1614 * high, and in the same segment. 1615 */ 1616 if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end)) 1617 continue; 1618 1619 /* 1620 * Skip m unless the blocks to allocate starting at m_ret are 1621 * all free. 1622 */ 1623 for (m_iter = m_ret; 1624 m_iter < m_ret + npages && max_order == m_iter->order; 1625 m_iter += 1 << max_order) { 1626 } 1627 if (m_iter < m_ret + npages) 1628 continue; 1629 return (m_ret); 1630 } 1631 return (NULL); 1632 } 1633 1634 /* 1635 * Find a run of contiguous physical pages from the specified free list 1636 * table. 1637 */ 1638 static vm_page_t 1639 vm_phys_find_queues_contig( 1640 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX], 1641 u_long npages, vm_paddr_t low, vm_paddr_t high, 1642 u_long alignment, vm_paddr_t boundary) 1643 { 1644 struct vm_freelist *fl; 1645 vm_page_t m_ret; 1646 vm_paddr_t pa, pa_end, size; 1647 int oind, order, pind; 1648 1649 KASSERT(npages > 0, ("npages is 0")); 1650 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1651 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1652 /* Compute the queue that is the best fit for npages. */ 1653 order = flsl(npages - 1); 1654 /* Search for a large enough free block. */ 1655 size = npages << PAGE_SHIFT; 1656 for (oind = order; oind < VM_NFREEORDER; oind++) { 1657 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) { 1658 fl = (*queues)[pind]; 1659 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { 1660 /* 1661 * Determine if the address range starting at pa 1662 * is within the given range, satisfies the 1663 * given alignment, and does not cross the given 1664 * boundary. 1665 */ 1666 pa = VM_PAGE_TO_PHYS(m_ret); 1667 pa_end = pa + size; 1668 if (low <= pa && pa_end <= high && 1669 vm_addr_ok(pa, size, alignment, boundary)) 1670 return (m_ret); 1671 } 1672 } 1673 } 1674 if (order < VM_NFREEORDER) 1675 return (NULL); 1676 /* Search for a long-enough sequence of max-order blocks. */ 1677 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) { 1678 fl = (*queues)[pind]; 1679 m_ret = vm_phys_find_freelist_contig(fl, npages, 1680 low, high, alignment, boundary); 1681 if (m_ret != NULL) 1682 return (m_ret); 1683 } 1684 return (NULL); 1685 } 1686 1687 /* 1688 * Allocate a contiguous set of physical pages of the given size 1689 * "npages" from the free lists. All of the physical pages must be at 1690 * or above the given physical address "low" and below the given 1691 * physical address "high". The given value "alignment" determines the 1692 * alignment of the first physical page in the set. If the given value 1693 * "boundary" is non-zero, then the set of physical pages cannot cross 1694 * any physical address boundary that is a multiple of that value. Both 1695 * "alignment" and "boundary" must be a power of two. Sets the pool 1696 * field to DEFAULT in the first allocated page. 
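 * As a hypothetical usage sketch (assuming 4 KB pages): a caller needing a
 * 2 MB physically contiguous, 2 MB aligned run below 4 GB would, with the
 * domain's free lock held, invoke
 *
 *	m = vm_phys_alloc_contig(domain, 512, 0, (vm_paddr_t)1 << 32,
 *	    2 * 1024 * 1024, 0);
 *
 * and receive either NULL or the first vm_page of such a run.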
1697 */ 1698 vm_page_t 1699 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1700 u_long alignment, vm_paddr_t boundary) 1701 { 1702 vm_paddr_t pa_end, pa_start; 1703 struct vm_freelist *fl; 1704 vm_page_t m, m_run; 1705 struct vm_phys_seg *seg; 1706 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX]; 1707 int oind, segind; 1708 1709 KASSERT(npages > 0, ("npages is 0")); 1710 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1711 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1712 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 1713 if (low >= high) 1714 return (NULL); 1715 queues = NULL; 1716 m_run = NULL; 1717 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 1718 seg = &vm_phys_segs[segind]; 1719 if (seg->start >= high || seg->domain != domain) 1720 continue; 1721 if (low >= seg->end) 1722 break; 1723 if (low <= seg->start) 1724 pa_start = seg->start; 1725 else 1726 pa_start = low; 1727 if (high < seg->end) 1728 pa_end = high; 1729 else 1730 pa_end = seg->end; 1731 if (pa_end - pa_start < ptoa(npages)) 1732 continue; 1733 /* 1734 * If a previous segment led to a search using 1735 * the same free lists as would this segment, then 1736 * we've actually already searched within this 1737 * too. So skip it. 1738 */ 1739 if (seg->free_queues == queues) 1740 continue; 1741 queues = seg->free_queues; 1742 m_run = vm_phys_find_queues_contig(queues, npages, 1743 low, high, alignment, boundary); 1744 if (m_run != NULL) 1745 break; 1746 } 1747 if (m_run == NULL) 1748 return (NULL); 1749 1750 /* Allocate pages from the page-range found. */ 1751 for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) { 1752 fl = (*queues)[m->pool]; 1753 oind = m->order; 1754 vm_freelist_rem(fl, m, oind); 1755 vm_phys_finish_init(m, oind); 1756 } 1757 /* Return excess pages to the free lists. */ 1758 fl = (*queues)[VM_FREEPOOL_DEFAULT]; 1759 vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 1760 VM_FREEPOOL_DEFAULT, 0); 1761 1762 /* Return page verified to satisfy conditions of request. */ 1763 pa_start = VM_PAGE_TO_PHYS(m_run); 1764 KASSERT(low <= pa_start, 1765 ("memory allocated below minimum requested range")); 1766 KASSERT(pa_start + ptoa(npages) <= high, 1767 ("memory allocated above maximum requested range")); 1768 seg = &vm_phys_segs[m_run->segind]; 1769 KASSERT(seg->domain == domain, 1770 ("memory not allocated from specified domain")); 1771 KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary), 1772 ("memory alignment/boundary constraints not satisfied")); 1773 return (m_run); 1774 } 1775 1776 /* 1777 * Return the index of the first unused slot which may be the terminating 1778 * entry. 1779 */ 1780 static int 1781 vm_phys_avail_count(void) 1782 { 1783 int i; 1784 1785 for (i = 0; i < PHYS_AVAIL_COUNT; i += 2) 1786 if (phys_avail[i] == 0 && phys_avail[i + 1] == 0) 1787 return (i); 1788 panic("Improperly terminated phys_avail[]"); 1789 } 1790 1791 /* 1792 * Assert that a phys_avail entry is valid. 
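 * For example, with 4 KB pages, a phys_avail[] of
 * { 0x1000, 0x9f000, 0x100000, 0x1f400000, 0, 0 } passes these checks:
 * both chunks start at even indices, every boundary is page aligned, and
 * each start lies below its end.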
1793 */ 1794 static void 1795 vm_phys_avail_check(int i) 1796 { 1797 if (i % 2 != 0) 1798 panic("Chunk start index %d is not even.", i); 1799 if (phys_avail[i] & PAGE_MASK) 1800 panic("Unaligned phys_avail[%d]: %#jx", i, 1801 (intmax_t)phys_avail[i]); 1802 if (phys_avail[i + 1] & PAGE_MASK) 1803 panic("Unaligned phys_avail[%d + 1]: %#jx", i, 1804 (intmax_t)phys_avail[i + 1]); 1805 if (phys_avail[i + 1] < phys_avail[i]) 1806 panic("phys_avail[%d]: start %#jx > end %#jx", i, 1807 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]); 1808 } 1809 1810 /* 1811 * Return the index of an overlapping phys_avail entry or -1. 1812 */ 1813 #ifdef NUMA 1814 static int 1815 vm_phys_avail_find(vm_paddr_t pa) 1816 { 1817 int i; 1818 1819 for (i = 0; phys_avail[i + 1]; i += 2) 1820 if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) 1821 return (i); 1822 return (-1); 1823 } 1824 #endif 1825 1826 /* 1827 * Return the index of the largest entry. 1828 */ 1829 int 1830 vm_phys_avail_largest(void) 1831 { 1832 vm_paddr_t sz, largesz; 1833 int largest; 1834 int i; 1835 1836 largest = 0; 1837 largesz = 0; 1838 for (i = 0; phys_avail[i + 1]; i += 2) { 1839 sz = vm_phys_avail_size(i); 1840 if (sz > largesz) { 1841 largesz = sz; 1842 largest = i; 1843 } 1844 } 1845 1846 return (largest); 1847 } 1848 1849 vm_paddr_t 1850 vm_phys_avail_size(int i) 1851 { 1852 1853 return (phys_avail[i + 1] - phys_avail[i]); 1854 } 1855 1856 /* 1857 * Split a chunk in phys_avail[] at the address 'pa'. 1858 * 1859 * 'pa' must be within a chunk (slots i and i + 1) or one of its boundaries. 1860 * Returns zero on actual split, in which case the two new chunks occupy slots 1861 * i to i + 3, else EJUSTRETURN if 'pa' was one of the boundaries (and no split 1862 * actually occurred) else ENOSPC if there are not enough slots in phys_avail[] 1863 * to represent the additional chunk caused by the split. 1864 */ 1865 static int 1866 vm_phys_avail_split(vm_paddr_t pa, int i) 1867 { 1868 int cnt; 1869 1870 vm_phys_avail_check(i); 1871 if (pa < phys_avail[i] || pa > phys_avail[i + 1]) 1872 panic("%s: Address %#jx not in range at slot %d [%#jx;%#jx].", 1873 __func__, (uintmax_t)pa, i, 1874 (uintmax_t)phys_avail[i], (uintmax_t)phys_avail[i + 1]); 1875 if (pa == phys_avail[i] || pa == phys_avail[i + 1]) 1876 return (EJUSTRETURN); 1877 cnt = vm_phys_avail_count(); 1878 if (cnt >= PHYS_AVAIL_ENTRIES) 1879 return (ENOSPC); 1880 memmove(&phys_avail[i + 2], &phys_avail[i], 1881 (cnt - i) * sizeof(phys_avail[0])); 1882 phys_avail[i + 1] = pa; 1883 phys_avail[i + 2] = pa; 1884 vm_phys_avail_check(i); 1885 vm_phys_avail_check(i+2); 1886 1887 return (0); 1888 } 1889 1890 /* 1891 * Check if a given physical address can be included as part of a crash dump. 
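 * Addresses backed by a vm_page honor that page's PG_NODUMP flag; addresses
 * without a page structure are dumpable only if they fall within one of the
 * dump_avail[] ranges.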
1892 */ 1893 bool 1894 vm_phys_is_dumpable(vm_paddr_t pa) 1895 { 1896 vm_page_t m; 1897 int i; 1898 1899 if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) 1900 return ((m->flags & PG_NODUMP) == 0); 1901 1902 for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { 1903 if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) 1904 return (true); 1905 } 1906 return (false); 1907 } 1908 1909 void 1910 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end) 1911 { 1912 struct vm_phys_seg *seg; 1913 1914 if (vm_phys_early_nsegs == -1) 1915 panic("%s: called after initialization", __func__); 1916 if (vm_phys_early_nsegs == nitems(vm_phys_early_segs)) 1917 panic("%s: ran out of early segments", __func__); 1918 1919 seg = &vm_phys_early_segs[vm_phys_early_nsegs++]; 1920 seg->start = start; 1921 seg->end = end; 1922 } 1923 1924 /* 1925 * This routine allocates NUMA node specific memory before the page 1926 * allocator is bootstrapped. 1927 */ 1928 vm_paddr_t 1929 vm_phys_early_alloc(int domain, size_t alloc_size) 1930 { 1931 #ifdef NUMA 1932 int mem_index; 1933 #endif 1934 int i, biggestone; 1935 vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align; 1936 1937 KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains), 1938 ("%s: invalid domain index %d", __func__, domain)); 1939 1940 /* 1941 * Search the mem_affinity array for the biggest address 1942 * range in the desired domain. This is used to constrain 1943 * the phys_avail selection below. 1944 */ 1945 biggestsize = 0; 1946 mem_start = 0; 1947 mem_end = -1; 1948 #ifdef NUMA 1949 mem_index = 0; 1950 if (mem_affinity != NULL) { 1951 for (i = 0;; i++) { 1952 size = mem_affinity[i].end - mem_affinity[i].start; 1953 if (size == 0) 1954 break; 1955 if (domain != -1 && mem_affinity[i].domain != domain) 1956 continue; 1957 if (size > biggestsize) { 1958 mem_index = i; 1959 biggestsize = size; 1960 } 1961 } 1962 mem_start = mem_affinity[mem_index].start; 1963 mem_end = mem_affinity[mem_index].end; 1964 } 1965 #endif 1966 1967 /* 1968 * Now find biggest physical segment in within the desired 1969 * numa domain. 1970 */ 1971 biggestsize = 0; 1972 biggestone = 0; 1973 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1974 /* skip regions that are out of range */ 1975 if (phys_avail[i+1] - alloc_size < mem_start || 1976 phys_avail[i+1] > mem_end) 1977 continue; 1978 size = vm_phys_avail_size(i); 1979 if (size > biggestsize) { 1980 biggestone = i; 1981 biggestsize = size; 1982 } 1983 } 1984 alloc_size = round_page(alloc_size); 1985 1986 /* 1987 * Grab single pages from the front to reduce fragmentation. 1988 */ 1989 if (alloc_size == PAGE_SIZE) { 1990 pa = phys_avail[biggestone]; 1991 phys_avail[biggestone] += PAGE_SIZE; 1992 vm_phys_avail_check(biggestone); 1993 return (pa); 1994 } 1995 1996 /* 1997 * Naturally align large allocations. 1998 */ 1999 align = phys_avail[biggestone + 1] & (alloc_size - 1); 2000 if (alloc_size + align > biggestsize) 2001 panic("cannot find a large enough size\n"); 2002 if (align != 0 && 2003 vm_phys_avail_split(phys_avail[biggestone + 1] - align, 2004 biggestone) != 0) 2005 /* Wasting memory. 
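		 * If the chunk cannot be split (for example, because
		 * phys_avail[] has no free slots left), the trailing
		 * "align" bytes are simply discarded so that the
		 * allocation taken from the end of the chunk remains
		 * naturally aligned.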
*/ 2006 phys_avail[biggestone + 1] -= align; 2007 2008 phys_avail[biggestone + 1] -= alloc_size; 2009 vm_phys_avail_check(biggestone); 2010 pa = phys_avail[biggestone + 1]; 2011 return (pa); 2012 } 2013 2014 void 2015 vm_phys_early_startup(void) 2016 { 2017 struct vm_phys_seg *seg; 2018 int i; 2019 2020 if (phys_avail[1] == 0) 2021 panic("phys_avail[] is empty"); 2022 2023 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 2024 phys_avail[i] = round_page(phys_avail[i]); 2025 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 2026 } 2027 2028 for (i = 0; i < vm_phys_early_nsegs; i++) { 2029 seg = &vm_phys_early_segs[i]; 2030 vm_phys_add_seg(seg->start, seg->end); 2031 } 2032 vm_phys_early_nsegs = -1; 2033 2034 #ifdef NUMA 2035 /* Force phys_avail to be split by domain. */ 2036 if (mem_affinity != NULL) { 2037 int idx; 2038 2039 for (i = 0; mem_affinity[i].end != 0; i++) { 2040 idx = vm_phys_avail_find(mem_affinity[i].start); 2041 if (idx != -1) 2042 vm_phys_avail_split(mem_affinity[i].start, idx); 2043 idx = vm_phys_avail_find(mem_affinity[i].end); 2044 if (idx != -1) 2045 vm_phys_avail_split(mem_affinity[i].end, idx); 2046 } 2047 } 2048 #endif 2049 } 2050 2051 #ifdef DDB 2052 /* 2053 * Show the number of physical pages in each of the free lists. 2054 */ 2055 DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE) 2056 { 2057 struct vm_freelist *fl; 2058 int flind, oind, pind, dom; 2059 2060 for (dom = 0; dom < vm_ndomains; dom++) { 2061 db_printf("DOMAIN: %d\n", dom); 2062 for (flind = 0; flind < vm_nfreelists; flind++) { 2063 db_printf("FREE LIST %d:\n" 2064 "\n ORDER (SIZE) | NUMBER" 2065 "\n ", flind); 2066 for (pind = 0; pind < VM_NFREEPOOL; pind++) 2067 db_printf(" | POOL %d", pind); 2068 db_printf("\n-- "); 2069 for (pind = 0; pind < VM_NFREEPOOL; pind++) 2070 db_printf("-- -- "); 2071 db_printf("--\n"); 2072 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 2073 db_printf(" %2.2d (%6.6dK)", oind, 2074 1 << (PAGE_SHIFT - 10 + oind)); 2075 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 2076 fl = vm_phys_free_queues[dom][flind][pind]; 2077 db_printf(" | %6.6d", fl[oind].lcnt); 2078 } 2079 db_printf("\n"); 2080 } 2081 db_printf("\n"); 2082 } 2083 db_printf("\n"); 2084 } 2085 } 2086 #endif 2087