/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 #include "opt_ddb.h" 45 #include "opt_vm.h" 46 47 #include <sys/param.h> 48 #include <sys/systm.h> 49 #include <sys/domainset.h> 50 #include <sys/lock.h> 51 #include <sys/kernel.h> 52 #include <sys/malloc.h> 53 #include <sys/mutex.h> 54 #include <sys/proc.h> 55 #include <sys/queue.h> 56 #include <sys/rwlock.h> 57 #include <sys/sbuf.h> 58 #include <sys/sysctl.h> 59 #include <sys/tree.h> 60 #include <sys/vmmeter.h> 61 62 #include <ddb/ddb.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_extern.h> 66 #include <vm/vm_param.h> 67 #include <vm/vm_kern.h> 68 #include <vm/vm_object.h> 69 #include <vm/vm_page.h> 70 #include <vm/vm_phys.h> 71 #include <vm/vm_pagequeue.h> 72 73 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, 74 "Too many physsegs."); 75 76 #ifdef NUMA 77 struct mem_affinity __read_mostly *mem_affinity; 78 int __read_mostly *mem_locality; 79 #endif 80 81 int __read_mostly vm_ndomains = 1; 82 domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1); 83 84 struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX]; 85 int __read_mostly vm_phys_nsegs; 86 static struct vm_phys_seg vm_phys_early_segs[8]; 87 static int vm_phys_early_nsegs; 88 89 struct vm_phys_fictitious_seg; 90 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, 91 struct vm_phys_fictitious_seg *); 92 93 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = 94 RB_INITIALIZER(&vm_phys_fictitious_tree); 95 96 struct vm_phys_fictitious_seg { 97 RB_ENTRY(vm_phys_fictitious_seg) node; 98 /* Memory region data */ 99 vm_paddr_t start; 100 vm_paddr_t end; 101 vm_page_t first_page; 102 }; 103 104 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, 105 vm_phys_fictitious_cmp); 106 107 static struct rwlock_padalign vm_phys_fictitious_reg_lock; 108 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); 109 110 static struct vm_freelist __aligned(CACHE_LINE_SIZE) 111 
vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL] 112 [VM_NFREEORDER_MAX]; 113 114 static int __read_mostly vm_nfreelists; 115 116 /* 117 * These "avail lists" are globals used to communicate boot-time physical 118 * memory layout to other parts of the kernel. Each physically contiguous 119 * region of memory is defined by a start address at an even index and an 120 * end address at the following odd index. Each list is terminated by a 121 * pair of zero entries. 122 * 123 * dump_avail tells the dump code what regions to include in a crash dump, and 124 * phys_avail is all of the remaining physical memory that is available for 125 * the vm system. 126 * 127 * Initially dump_avail and phys_avail are identical. Boot time memory 128 * allocations remove extents from phys_avail that may still be included 129 * in dumps. 130 */ 131 vm_paddr_t phys_avail[PHYS_AVAIL_COUNT]; 132 vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; 133 134 /* 135 * Provides the mapping from VM_FREELIST_* to free list indices (flind). 136 */ 137 static int __read_mostly vm_freelist_to_flind[VM_NFREELIST]; 138 139 CTASSERT(VM_FREELIST_DEFAULT == 0); 140 141 #ifdef VM_FREELIST_DMA32 142 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) 143 #endif 144 145 /* 146 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about 147 * the ordering of the free list boundaries. 
148 */ 149 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) 150 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); 151 #endif 152 153 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); 154 SYSCTL_OID(_vm, OID_AUTO, phys_free, 155 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 156 sysctl_vm_phys_free, "A", 157 "Phys Free Info"); 158 159 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); 160 SYSCTL_OID(_vm, OID_AUTO, phys_segs, 161 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 162 sysctl_vm_phys_segs, "A", 163 "Phys Seg Info"); 164 165 #ifdef NUMA 166 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); 167 SYSCTL_OID(_vm, OID_AUTO, phys_locality, 168 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 169 sysctl_vm_phys_locality, "A", 170 "Phys Locality Info"); 171 #endif 172 173 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, 174 &vm_ndomains, 0, "Number of physical memory domains available."); 175 176 static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, 177 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 178 vm_paddr_t boundary); 179 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); 180 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); 181 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, 182 int order, int tail); 183 184 /* 185 * Red-black tree helpers for vm fictitious range management. 
186 */ 187 static inline int 188 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, 189 struct vm_phys_fictitious_seg *range) 190 { 191 192 KASSERT(range->start != 0 && range->end != 0, 193 ("Invalid range passed on search for vm_fictitious page")); 194 if (p->start >= range->end) 195 return (1); 196 if (p->start < range->start) 197 return (-1); 198 199 return (0); 200 } 201 202 static int 203 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, 204 struct vm_phys_fictitious_seg *p2) 205 { 206 207 /* Check if this is a search for a page */ 208 if (p1->end == 0) 209 return (vm_phys_fictitious_in_range(p1, p2)); 210 211 KASSERT(p2->end != 0, 212 ("Invalid range passed as second parameter to vm fictitious comparison")); 213 214 /* Searching to add a new range */ 215 if (p1->end <= p2->start) 216 return (-1); 217 if (p1->start >= p2->end) 218 return (1); 219 220 panic("Trying to add overlapping vm fictitious ranges:\n" 221 "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, 222 (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); 223 } 224 225 int 226 vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high) 227 { 228 #ifdef NUMA 229 domainset_t mask; 230 int i; 231 232 if (vm_ndomains == 1 || mem_affinity == NULL) 233 return (0); 234 235 DOMAINSET_ZERO(&mask); 236 /* 237 * Check for any memory that overlaps low, high. 238 */ 239 for (i = 0; mem_affinity[i].end != 0; i++) 240 if (mem_affinity[i].start <= high && 241 mem_affinity[i].end >= low) 242 DOMAINSET_SET(mem_affinity[i].domain, &mask); 243 if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask)) 244 return (prefer); 245 if (DOMAINSET_EMPTY(&mask)) 246 panic("vm_phys_domain_match: Impossible constraint"); 247 return (DOMAINSET_FFS(&mask) - 1); 248 #else 249 return (0); 250 #endif 251 } 252 253 /* 254 * Outputs the state of the physical memory allocator, specifically, 255 * the amount of physical memory in each free list. 
256 */ 257 static int 258 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) 259 { 260 struct sbuf sbuf; 261 struct vm_freelist *fl; 262 int dom, error, flind, oind, pind; 263 264 error = sysctl_wire_old_buffer(req, 0); 265 if (error != 0) 266 return (error); 267 sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); 268 for (dom = 0; dom < vm_ndomains; dom++) { 269 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); 270 for (flind = 0; flind < vm_nfreelists; flind++) { 271 sbuf_printf(&sbuf, "\nFREE LIST %d:\n" 272 "\n ORDER (SIZE) | NUMBER" 273 "\n ", flind); 274 for (pind = 0; pind < VM_NFREEPOOL; pind++) 275 sbuf_printf(&sbuf, " | POOL %d", pind); 276 sbuf_printf(&sbuf, "\n-- "); 277 for (pind = 0; pind < VM_NFREEPOOL; pind++) 278 sbuf_printf(&sbuf, "-- -- "); 279 sbuf_printf(&sbuf, "--\n"); 280 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 281 sbuf_printf(&sbuf, " %2d (%6dK)", oind, 282 1 << (PAGE_SHIFT - 10 + oind)); 283 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 284 fl = vm_phys_free_queues[dom][flind][pind]; 285 sbuf_printf(&sbuf, " | %6d", 286 fl[oind].lcnt); 287 } 288 sbuf_printf(&sbuf, "\n"); 289 } 290 } 291 } 292 error = sbuf_finish(&sbuf); 293 sbuf_delete(&sbuf); 294 return (error); 295 } 296 297 /* 298 * Outputs the set of physical memory segments. 
299 */ 300 static int 301 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) 302 { 303 struct sbuf sbuf; 304 struct vm_phys_seg *seg; 305 int error, segind; 306 307 error = sysctl_wire_old_buffer(req, 0); 308 if (error != 0) 309 return (error); 310 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 311 for (segind = 0; segind < vm_phys_nsegs; segind++) { 312 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); 313 seg = &vm_phys_segs[segind]; 314 sbuf_printf(&sbuf, "start: %#jx\n", 315 (uintmax_t)seg->start); 316 sbuf_printf(&sbuf, "end: %#jx\n", 317 (uintmax_t)seg->end); 318 sbuf_printf(&sbuf, "domain: %d\n", seg->domain); 319 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); 320 } 321 error = sbuf_finish(&sbuf); 322 sbuf_delete(&sbuf); 323 return (error); 324 } 325 326 /* 327 * Return affinity, or -1 if there's no affinity information. 328 */ 329 int 330 vm_phys_mem_affinity(int f, int t) 331 { 332 333 #ifdef NUMA 334 if (mem_locality == NULL) 335 return (-1); 336 if (f >= vm_ndomains || t >= vm_ndomains) 337 return (-1); 338 return (mem_locality[f * vm_ndomains + t]); 339 #else 340 return (-1); 341 #endif 342 } 343 344 #ifdef NUMA 345 /* 346 * Outputs the VM locality table. 
347 */ 348 static int 349 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) 350 { 351 struct sbuf sbuf; 352 int error, i, j; 353 354 error = sysctl_wire_old_buffer(req, 0); 355 if (error != 0) 356 return (error); 357 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 358 359 sbuf_printf(&sbuf, "\n"); 360 361 for (i = 0; i < vm_ndomains; i++) { 362 sbuf_printf(&sbuf, "%d: ", i); 363 for (j = 0; j < vm_ndomains; j++) { 364 sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); 365 } 366 sbuf_printf(&sbuf, "\n"); 367 } 368 error = sbuf_finish(&sbuf); 369 sbuf_delete(&sbuf); 370 return (error); 371 } 372 #endif 373 374 static void 375 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) 376 { 377 378 m->order = order; 379 if (tail) 380 TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); 381 else 382 TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); 383 fl[order].lcnt++; 384 } 385 386 static void 387 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) 388 { 389 390 TAILQ_REMOVE(&fl[order].pl, m, listq); 391 fl[order].lcnt--; 392 m->order = VM_NFREEORDER; 393 } 394 395 /* 396 * Create a physical memory segment. 
397 */ 398 static void 399 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) 400 { 401 struct vm_phys_seg *seg; 402 403 KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, 404 ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); 405 KASSERT(domain >= 0 && domain < vm_ndomains, 406 ("vm_phys_create_seg: invalid domain provided")); 407 seg = &vm_phys_segs[vm_phys_nsegs++]; 408 while (seg > vm_phys_segs && (seg - 1)->start >= end) { 409 *seg = *(seg - 1); 410 seg--; 411 } 412 seg->start = start; 413 seg->end = end; 414 seg->domain = domain; 415 } 416 417 static void 418 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) 419 { 420 #ifdef NUMA 421 int i; 422 423 if (mem_affinity == NULL) { 424 _vm_phys_create_seg(start, end, 0); 425 return; 426 } 427 428 for (i = 0;; i++) { 429 if (mem_affinity[i].end == 0) 430 panic("Reached end of affinity info"); 431 if (mem_affinity[i].end <= start) 432 continue; 433 if (mem_affinity[i].start > start) 434 panic("No affinity info for start %jx", 435 (uintmax_t)start); 436 if (mem_affinity[i].end >= end) { 437 _vm_phys_create_seg(start, end, 438 mem_affinity[i].domain); 439 break; 440 } 441 _vm_phys_create_seg(start, mem_affinity[i].end, 442 mem_affinity[i].domain); 443 start = mem_affinity[i].end; 444 } 445 #else 446 _vm_phys_create_seg(start, end, 0); 447 #endif 448 } 449 450 /* 451 * Add a physical memory segment. 452 */ 453 void 454 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) 455 { 456 vm_paddr_t paddr; 457 458 KASSERT((start & PAGE_MASK) == 0, 459 ("vm_phys_define_seg: start is not page aligned")); 460 KASSERT((end & PAGE_MASK) == 0, 461 ("vm_phys_define_seg: end is not page aligned")); 462 463 /* 464 * Split the physical memory segment if it spans two or more free 465 * list boundaries. 
466 */ 467 paddr = start; 468 #ifdef VM_FREELIST_LOWMEM 469 if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { 470 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); 471 paddr = VM_LOWMEM_BOUNDARY; 472 } 473 #endif 474 #ifdef VM_FREELIST_DMA32 475 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { 476 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); 477 paddr = VM_DMA32_BOUNDARY; 478 } 479 #endif 480 vm_phys_create_seg(paddr, end); 481 } 482 483 /* 484 * Initialize the physical memory allocator. 485 * 486 * Requires that vm_page_array is initialized! 487 */ 488 void 489 vm_phys_init(void) 490 { 491 struct vm_freelist *fl; 492 struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg; 493 u_long npages; 494 int dom, flind, freelist, oind, pind, segind; 495 496 /* 497 * Compute the number of free lists, and generate the mapping from the 498 * manifest constants VM_FREELIST_* to the free list indices. 499 * 500 * Initially, the entries of vm_freelist_to_flind[] are set to either 501 * 0 or 1 to indicate which free lists should be created. 502 */ 503 npages = 0; 504 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 505 seg = &vm_phys_segs[segind]; 506 #ifdef VM_FREELIST_LOWMEM 507 if (seg->end <= VM_LOWMEM_BOUNDARY) 508 vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; 509 else 510 #endif 511 #ifdef VM_FREELIST_DMA32 512 if ( 513 #ifdef VM_DMA32_NPAGES_THRESHOLD 514 /* 515 * Create the DMA32 free list only if the amount of 516 * physical memory above physical address 4G exceeds the 517 * given threshold. 518 */ 519 npages > VM_DMA32_NPAGES_THRESHOLD && 520 #endif 521 seg->end <= VM_DMA32_BOUNDARY) 522 vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; 523 else 524 #endif 525 { 526 npages += atop(seg->end - seg->start); 527 vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; 528 } 529 } 530 /* Change each entry into a running total of the free lists. 
*/ 531 for (freelist = 1; freelist < VM_NFREELIST; freelist++) { 532 vm_freelist_to_flind[freelist] += 533 vm_freelist_to_flind[freelist - 1]; 534 } 535 vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; 536 KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); 537 /* Change each entry into a free list index. */ 538 for (freelist = 0; freelist < VM_NFREELIST; freelist++) 539 vm_freelist_to_flind[freelist]--; 540 541 /* 542 * Initialize the first_page and free_queues fields of each physical 543 * memory segment. 544 */ 545 #ifdef VM_PHYSSEG_SPARSE 546 npages = 0; 547 #endif 548 for (segind = 0; segind < vm_phys_nsegs; segind++) { 549 seg = &vm_phys_segs[segind]; 550 #ifdef VM_PHYSSEG_SPARSE 551 seg->first_page = &vm_page_array[npages]; 552 npages += atop(seg->end - seg->start); 553 #else 554 seg->first_page = PHYS_TO_VM_PAGE(seg->start); 555 #endif 556 #ifdef VM_FREELIST_LOWMEM 557 if (seg->end <= VM_LOWMEM_BOUNDARY) { 558 flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; 559 KASSERT(flind >= 0, 560 ("vm_phys_init: LOWMEM flind < 0")); 561 } else 562 #endif 563 #ifdef VM_FREELIST_DMA32 564 if (seg->end <= VM_DMA32_BOUNDARY) { 565 flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; 566 KASSERT(flind >= 0, 567 ("vm_phys_init: DMA32 flind < 0")); 568 } else 569 #endif 570 { 571 flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; 572 KASSERT(flind >= 0, 573 ("vm_phys_init: DEFAULT flind < 0")); 574 } 575 seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; 576 } 577 578 /* 579 * Coalesce physical memory segments that are contiguous and share the 580 * same per-domain free queues. 
581 */ 582 prev_seg = vm_phys_segs; 583 seg = &vm_phys_segs[1]; 584 end_seg = &vm_phys_segs[vm_phys_nsegs]; 585 while (seg < end_seg) { 586 if (prev_seg->end == seg->start && 587 prev_seg->free_queues == seg->free_queues) { 588 prev_seg->end = seg->end; 589 KASSERT(prev_seg->domain == seg->domain, 590 ("vm_phys_init: free queues cannot span domains")); 591 vm_phys_nsegs--; 592 end_seg--; 593 for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++) 594 *tmp_seg = *(tmp_seg + 1); 595 } else { 596 prev_seg = seg; 597 seg++; 598 } 599 } 600 601 /* 602 * Initialize the free queues. 603 */ 604 for (dom = 0; dom < vm_ndomains; dom++) { 605 for (flind = 0; flind < vm_nfreelists; flind++) { 606 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 607 fl = vm_phys_free_queues[dom][flind][pind]; 608 for (oind = 0; oind < VM_NFREEORDER; oind++) 609 TAILQ_INIT(&fl[oind].pl); 610 } 611 } 612 } 613 614 rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); 615 } 616 617 /* 618 * Register info about the NUMA topology of the system. 619 * 620 * Invoked by platform-dependent code prior to vm_phys_init(). 621 */ 622 void 623 vm_phys_register_domains(int ndomains, struct mem_affinity *affinity, 624 int *locality) 625 { 626 #ifdef NUMA 627 int d, i; 628 629 /* 630 * For now the only override value that we support is 1, which 631 * effectively disables NUMA-awareness in the allocators. 632 */ 633 d = 0; 634 TUNABLE_INT_FETCH("vm.numa.disabled", &d); 635 if (d) 636 ndomains = 1; 637 638 if (ndomains > 1) { 639 vm_ndomains = ndomains; 640 mem_affinity = affinity; 641 mem_locality = locality; 642 } 643 644 for (i = 0; i < vm_ndomains; i++) 645 DOMAINSET_SET(i, &all_domains); 646 #else 647 (void)ndomains; 648 (void)affinity; 649 (void)locality; 650 #endif 651 } 652 653 /* 654 * Split a contiguous, power of two-sized set of physical pages. 
655 * 656 * When this function is called by a page allocation function, the caller 657 * should request insertion at the head unless the order [order, oind) queues 658 * are known to be empty. The objective being to reduce the likelihood of 659 * long-term fragmentation by promoting contemporaneous allocation and 660 * (hopefully) deallocation. 661 */ 662 static __inline void 663 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, 664 int tail) 665 { 666 vm_page_t m_buddy; 667 668 while (oind > order) { 669 oind--; 670 m_buddy = &m[1 << oind]; 671 KASSERT(m_buddy->order == VM_NFREEORDER, 672 ("vm_phys_split_pages: page %p has unexpected order %d", 673 m_buddy, m_buddy->order)); 674 vm_freelist_add(fl, m_buddy, oind, tail); 675 } 676 } 677 678 /* 679 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned 680 * and sized set to the specified free list. 681 * 682 * When this function is called by a page allocation function, the caller 683 * should request insertion at the head unless the lower-order queues are 684 * known to be empty. The objective being to reduce the likelihood of long- 685 * term fragmentation by promoting contemporaneous allocation and (hopefully) 686 * deallocation. 687 * 688 * The physical page m's buddy must not be free. 
689 */ 690 static void 691 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) 692 { 693 u_int n; 694 int order; 695 696 KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0")); 697 KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) & 698 ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0, 699 ("vm_phys_enq_range: page %p and npages %u are misaligned", 700 m, npages)); 701 do { 702 KASSERT(m->order == VM_NFREEORDER, 703 ("vm_phys_enq_range: page %p has unexpected order %d", 704 m, m->order)); 705 order = ffs(npages) - 1; 706 KASSERT(order < VM_NFREEORDER, 707 ("vm_phys_enq_range: order %d is out of range", order)); 708 vm_freelist_add(fl, m, order, tail); 709 n = 1 << order; 710 m += n; 711 npages -= n; 712 } while (npages > 0); 713 } 714 715 /* 716 * Set the pool for a contiguous, power of two-sized set of physical pages. 717 */ 718 static void 719 vm_phys_set_pool(int pool, vm_page_t m, int order) 720 { 721 vm_page_t m_tmp; 722 723 for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) 724 m_tmp->pool = pool; 725 } 726 727 /* 728 * Tries to allocate the specified number of pages from the specified pool 729 * within the specified domain. Returns the actual number of allocated pages 730 * and a pointer to each page through the array ma[]. 731 * 732 * The returned pages may not be physically contiguous. However, in contrast 733 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0), 734 * calling this function once to allocate the desired number of pages will 735 * avoid wasted time in vm_phys_split_pages(). 736 * 737 * The free page queues for the specified domain must be locked. 
738 */ 739 int 740 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) 741 { 742 struct vm_freelist *alt, *fl; 743 vm_page_t m; 744 int avail, end, flind, freelist, i, need, oind, pind; 745 746 KASSERT(domain >= 0 && domain < vm_ndomains, 747 ("vm_phys_alloc_npages: domain %d is out of range", domain)); 748 KASSERT(pool < VM_NFREEPOOL, 749 ("vm_phys_alloc_npages: pool %d is out of range", pool)); 750 KASSERT(npages <= 1 << (VM_NFREEORDER - 1), 751 ("vm_phys_alloc_npages: npages %d is out of range", npages)); 752 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 753 i = 0; 754 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 755 flind = vm_freelist_to_flind[freelist]; 756 if (flind < 0) 757 continue; 758 fl = vm_phys_free_queues[domain][flind][pool]; 759 for (oind = 0; oind < VM_NFREEORDER; oind++) { 760 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 761 vm_freelist_rem(fl, m, oind); 762 avail = 1 << oind; 763 need = imin(npages - i, avail); 764 for (end = i + need; i < end;) 765 ma[i++] = m++; 766 if (need < avail) { 767 /* 768 * Return excess pages to fl. Its 769 * order [0, oind) queues are empty. 770 */ 771 vm_phys_enq_range(m, avail - need, fl, 772 1); 773 return (npages); 774 } else if (i == npages) 775 return (npages); 776 } 777 } 778 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 779 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 780 alt = vm_phys_free_queues[domain][flind][pind]; 781 while ((m = TAILQ_FIRST(&alt[oind].pl)) != 782 NULL) { 783 vm_freelist_rem(alt, m, oind); 784 vm_phys_set_pool(pool, m, oind); 785 avail = 1 << oind; 786 need = imin(npages - i, avail); 787 for (end = i + need; i < end;) 788 ma[i++] = m++; 789 if (need < avail) { 790 /* 791 * Return excess pages to fl. 792 * Its order [0, oind) queues 793 * are empty. 
794 */ 795 vm_phys_enq_range(m, avail - 796 need, fl, 1); 797 return (npages); 798 } else if (i == npages) 799 return (npages); 800 } 801 } 802 } 803 } 804 return (i); 805 } 806 807 /* 808 * Allocate a contiguous, power of two-sized set of physical pages 809 * from the free lists. 810 * 811 * The free page queues must be locked. 812 */ 813 vm_page_t 814 vm_phys_alloc_pages(int domain, int pool, int order) 815 { 816 vm_page_t m; 817 int freelist; 818 819 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 820 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); 821 if (m != NULL) 822 return (m); 823 } 824 return (NULL); 825 } 826 827 /* 828 * Allocate a contiguous, power of two-sized set of physical pages from the 829 * specified free list. The free list must be specified using one of the 830 * manifest constants VM_FREELIST_*. 831 * 832 * The free page queues must be locked. 833 */ 834 vm_page_t 835 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) 836 { 837 struct vm_freelist *alt, *fl; 838 vm_page_t m; 839 int oind, pind, flind; 840 841 KASSERT(domain >= 0 && domain < vm_ndomains, 842 ("vm_phys_alloc_freelist_pages: domain %d is out of range", 843 domain)); 844 KASSERT(freelist < VM_NFREELIST, 845 ("vm_phys_alloc_freelist_pages: freelist %d is out of range", 846 freelist)); 847 KASSERT(pool < VM_NFREEPOOL, 848 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); 849 KASSERT(order < VM_NFREEORDER, 850 ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); 851 852 flind = vm_freelist_to_flind[freelist]; 853 /* Check if freelist is present */ 854 if (flind < 0) 855 return (NULL); 856 857 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 858 fl = &vm_phys_free_queues[domain][flind][pool][0]; 859 for (oind = order; oind < VM_NFREEORDER; oind++) { 860 m = TAILQ_FIRST(&fl[oind].pl); 861 if (m != NULL) { 862 vm_freelist_rem(fl, m, oind); 863 /* The order [order, oind) queues are empty. 
*/ 864 vm_phys_split_pages(m, oind, fl, order, 1); 865 return (m); 866 } 867 } 868 869 /* 870 * The given pool was empty. Find the largest 871 * contiguous, power-of-two-sized set of pages in any 872 * pool. Transfer these pages to the given pool, and 873 * use them to satisfy the allocation. 874 */ 875 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { 876 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 877 alt = &vm_phys_free_queues[domain][flind][pind][0]; 878 m = TAILQ_FIRST(&alt[oind].pl); 879 if (m != NULL) { 880 vm_freelist_rem(alt, m, oind); 881 vm_phys_set_pool(pool, m, oind); 882 /* The order [order, oind) queues are empty. */ 883 vm_phys_split_pages(m, oind, fl, order, 1); 884 return (m); 885 } 886 } 887 } 888 return (NULL); 889 } 890 891 /* 892 * Find the vm_page corresponding to the given physical address. 893 */ 894 vm_page_t 895 vm_phys_paddr_to_vm_page(vm_paddr_t pa) 896 { 897 struct vm_phys_seg *seg; 898 int segind; 899 900 for (segind = 0; segind < vm_phys_nsegs; segind++) { 901 seg = &vm_phys_segs[segind]; 902 if (pa >= seg->start && pa < seg->end) 903 return (&seg->first_page[atop(pa - seg->start)]); 904 } 905 return (NULL); 906 } 907 908 vm_page_t 909 vm_phys_fictitious_to_vm_page(vm_paddr_t pa) 910 { 911 struct vm_phys_fictitious_seg tmp, *seg; 912 vm_page_t m; 913 914 m = NULL; 915 tmp.start = pa; 916 tmp.end = 0; 917 918 rw_rlock(&vm_phys_fictitious_reg_lock); 919 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 920 rw_runlock(&vm_phys_fictitious_reg_lock); 921 if (seg == NULL) 922 return (NULL); 923 924 m = &seg->first_page[atop(pa - seg->start)]; 925 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); 926 927 return (m); 928 } 929 930 static inline void 931 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, 932 long page_count, vm_memattr_t memattr) 933 { 934 long i; 935 936 bzero(range, page_count * sizeof(*range)); 937 for (i = 0; i < page_count; i++) { 938 vm_page_initfake(&range[i], start + 
PAGE_SIZE * i, memattr); 939 range[i].oflags &= ~VPO_UNMANAGED; 940 range[i].busy_lock = VPB_UNBUSIED; 941 } 942 } 943 944 int 945 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, 946 vm_memattr_t memattr) 947 { 948 struct vm_phys_fictitious_seg *seg; 949 vm_page_t fp; 950 long page_count; 951 #ifdef VM_PHYSSEG_DENSE 952 long pi, pe; 953 long dpage_count; 954 #endif 955 956 KASSERT(start < end, 957 ("Start of segment isn't less than end (start: %jx end: %jx)", 958 (uintmax_t)start, (uintmax_t)end)); 959 960 page_count = (end - start) / PAGE_SIZE; 961 962 #ifdef VM_PHYSSEG_DENSE 963 pi = atop(start); 964 pe = atop(end); 965 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 966 fp = &vm_page_array[pi - first_page]; 967 if ((pe - first_page) > vm_page_array_size) { 968 /* 969 * We have a segment that starts inside 970 * of vm_page_array, but ends outside of it. 971 * 972 * Use vm_page_array pages for those that are 973 * inside of the vm_page_array range, and 974 * allocate the remaining ones. 975 */ 976 dpage_count = vm_page_array_size - (pi - first_page); 977 vm_phys_fictitious_init_range(fp, start, dpage_count, 978 memattr); 979 page_count -= dpage_count; 980 start += ptoa(dpage_count); 981 goto alloc; 982 } 983 /* 984 * We can allocate the full range from vm_page_array, 985 * so there's no need to register the range in the tree. 986 */ 987 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 988 return (0); 989 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 990 /* 991 * We have a segment that ends inside of vm_page_array, 992 * but starts outside of it. 
993 */ 994 fp = &vm_page_array[0]; 995 dpage_count = pe - first_page; 996 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, 997 memattr); 998 end -= ptoa(dpage_count); 999 page_count -= dpage_count; 1000 goto alloc; 1001 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1002 /* 1003 * Trying to register a fictitious range that expands before 1004 * and after vm_page_array. 1005 */ 1006 return (EINVAL); 1007 } else { 1008 alloc: 1009 #endif 1010 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, 1011 M_WAITOK); 1012 #ifdef VM_PHYSSEG_DENSE 1013 } 1014 #endif 1015 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1016 1017 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); 1018 seg->start = start; 1019 seg->end = end; 1020 seg->first_page = fp; 1021 1022 rw_wlock(&vm_phys_fictitious_reg_lock); 1023 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); 1024 rw_wunlock(&vm_phys_fictitious_reg_lock); 1025 1026 return (0); 1027 } 1028 1029 void 1030 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) 1031 { 1032 struct vm_phys_fictitious_seg *seg, tmp; 1033 #ifdef VM_PHYSSEG_DENSE 1034 long pi, pe; 1035 #endif 1036 1037 KASSERT(start < end, 1038 ("Start of segment isn't less than end (start: %jx end: %jx)", 1039 (uintmax_t)start, (uintmax_t)end)); 1040 1041 #ifdef VM_PHYSSEG_DENSE 1042 pi = atop(start); 1043 pe = atop(end); 1044 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1045 if ((pe - first_page) <= vm_page_array_size) { 1046 /* 1047 * This segment was allocated using vm_page_array 1048 * only, there's nothing to do since those pages 1049 * were never added to the tree. 1050 */ 1051 return; 1052 } 1053 /* 1054 * We have a segment that starts inside 1055 * of vm_page_array, but ends outside of it. 1056 * 1057 * Calculate how many pages were added to the 1058 * tree and free them. 
1059 */ 1060 start = ptoa(first_page + vm_page_array_size); 1061 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1062 /* 1063 * We have a segment that ends inside of vm_page_array, 1064 * but starts outside of it. 1065 */ 1066 end = ptoa(first_page); 1067 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1068 /* Since it's not possible to register such a range, panic. */ 1069 panic( 1070 "Unregistering not registered fictitious range [%#jx:%#jx]", 1071 (uintmax_t)start, (uintmax_t)end); 1072 } 1073 #endif 1074 tmp.start = start; 1075 tmp.end = 0; 1076 1077 rw_wlock(&vm_phys_fictitious_reg_lock); 1078 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1079 if (seg->start != start || seg->end != end) { 1080 rw_wunlock(&vm_phys_fictitious_reg_lock); 1081 panic( 1082 "Unregistering not registered fictitious range [%#jx:%#jx]", 1083 (uintmax_t)start, (uintmax_t)end); 1084 } 1085 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); 1086 rw_wunlock(&vm_phys_fictitious_reg_lock); 1087 free(seg->first_page, M_FICT_PAGES); 1088 free(seg, M_FICT_PAGES); 1089 } 1090 1091 /* 1092 * Free a contiguous, power of two-sized set of physical pages. 1093 * 1094 * The free page queues must be locked. 
1095 */ 1096 void 1097 vm_phys_free_pages(vm_page_t m, int order) 1098 { 1099 struct vm_freelist *fl; 1100 struct vm_phys_seg *seg; 1101 vm_paddr_t pa; 1102 vm_page_t m_buddy; 1103 1104 KASSERT(m->order == VM_NFREEORDER, 1105 ("vm_phys_free_pages: page %p has unexpected order %d", 1106 m, m->order)); 1107 KASSERT(m->pool < VM_NFREEPOOL, 1108 ("vm_phys_free_pages: page %p has unexpected pool %d", 1109 m, m->pool)); 1110 KASSERT(order < VM_NFREEORDER, 1111 ("vm_phys_free_pages: order %d is out of range", order)); 1112 seg = &vm_phys_segs[m->segind]; 1113 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1114 if (order < VM_NFREEORDER - 1) { 1115 pa = VM_PAGE_TO_PHYS(m); 1116 do { 1117 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); 1118 if (pa < seg->start || pa >= seg->end) 1119 break; 1120 m_buddy = &seg->first_page[atop(pa - seg->start)]; 1121 if (m_buddy->order != order) 1122 break; 1123 fl = (*seg->free_queues)[m_buddy->pool]; 1124 vm_freelist_rem(fl, m_buddy, order); 1125 if (m_buddy->pool != m->pool) 1126 vm_phys_set_pool(m->pool, m_buddy, order); 1127 order++; 1128 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); 1129 m = &seg->first_page[atop(pa - seg->start)]; 1130 } while (order < VM_NFREEORDER - 1); 1131 } 1132 fl = (*seg->free_queues)[m->pool]; 1133 vm_freelist_add(fl, m, order, 1); 1134 } 1135 1136 /* 1137 * Return the largest possible order of a set of pages starting at m. 1138 */ 1139 static int 1140 max_order(vm_page_t m) 1141 { 1142 1143 /* 1144 * Unsigned "min" is used here so that "order" is assigned 1145 * "VM_NFREEORDER - 1" when "m"'s physical address is zero 1146 * or the low-order bits of its physical address are zero 1147 * because the size of a physical address exceeds the size of 1148 * a long. 1149 */ 1150 return (min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, 1151 VM_NFREEORDER - 1)); 1152 } 1153 1154 /* 1155 * Free a contiguous, arbitrarily sized set of physical pages, without 1156 * merging across set boundaries. 
1157 * 1158 * The free page queues must be locked. 1159 */ 1160 void 1161 vm_phys_enqueue_contig(vm_page_t m, u_long npages) 1162 { 1163 struct vm_freelist *fl; 1164 struct vm_phys_seg *seg; 1165 vm_page_t m_end; 1166 int order; 1167 1168 /* 1169 * Avoid unnecessary coalescing by freeing the pages in the largest 1170 * possible power-of-two-sized subsets. 1171 */ 1172 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1173 seg = &vm_phys_segs[m->segind]; 1174 fl = (*seg->free_queues)[m->pool]; 1175 m_end = m + npages; 1176 /* Free blocks of increasing size. */ 1177 while ((order = max_order(m)) < VM_NFREEORDER - 1 && 1178 m + (1 << order) <= m_end) { 1179 KASSERT(seg == &vm_phys_segs[m->segind], 1180 ("%s: page range [%p,%p) spans multiple segments", 1181 __func__, m_end - npages, m)); 1182 vm_freelist_add(fl, m, order, 1); 1183 m += 1 << order; 1184 } 1185 /* Free blocks of maximum size. */ 1186 while (m + (1 << order) <= m_end) { 1187 KASSERT(seg == &vm_phys_segs[m->segind], 1188 ("%s: page range [%p,%p) spans multiple segments", 1189 __func__, m_end - npages, m)); 1190 vm_freelist_add(fl, m, order, 1); 1191 m += 1 << order; 1192 } 1193 /* Free blocks of diminishing size. */ 1194 while (m < m_end) { 1195 KASSERT(seg == &vm_phys_segs[m->segind], 1196 ("%s: page range [%p,%p) spans multiple segments", 1197 __func__, m_end - npages, m)); 1198 order = flsl(m_end - m) - 1; 1199 vm_freelist_add(fl, m, order, 1); 1200 m += 1 << order; 1201 } 1202 } 1203 1204 /* 1205 * Free a contiguous, arbitrarily sized set of physical pages. 1206 * 1207 * The free page queues must be locked. 
1208 */ 1209 void 1210 vm_phys_free_contig(vm_page_t m, u_long npages) 1211 { 1212 int order_start, order_end; 1213 vm_page_t m_start, m_end; 1214 1215 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1216 1217 m_start = m; 1218 order_start = max_order(m_start); 1219 if (order_start < VM_NFREEORDER - 1) 1220 m_start += 1 << order_start; 1221 m_end = m + npages; 1222 order_end = max_order(m_end); 1223 if (order_end < VM_NFREEORDER - 1) 1224 m_end -= 1 << order_end; 1225 /* 1226 * Avoid unnecessary coalescing by freeing the pages at the start and 1227 * end of the range last. 1228 */ 1229 if (m_start < m_end) 1230 vm_phys_enqueue_contig(m_start, m_end - m_start); 1231 if (order_start < VM_NFREEORDER - 1) 1232 vm_phys_free_pages(m, order_start); 1233 if (order_end < VM_NFREEORDER - 1) 1234 vm_phys_free_pages(m_end, order_end); 1235 } 1236 1237 /* 1238 * Scan physical memory between the specified addresses "low" and "high" for a 1239 * run of contiguous physical pages that satisfy the specified conditions, and 1240 * return the lowest page in the run. The specified "alignment" determines 1241 * the alignment of the lowest physical page in the run. If the specified 1242 * "boundary" is non-zero, then the run of physical pages cannot span a 1243 * physical address that is a multiple of "boundary". 1244 * 1245 * "npages" must be greater than zero. Both "alignment" and "boundary" must 1246 * be a power of two. 
1247 */ 1248 vm_page_t 1249 vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1250 u_long alignment, vm_paddr_t boundary, int options) 1251 { 1252 vm_paddr_t pa_end; 1253 vm_page_t m_end, m_run, m_start; 1254 struct vm_phys_seg *seg; 1255 int segind; 1256 1257 KASSERT(npages > 0, ("npages is 0")); 1258 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1259 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1260 if (low >= high) 1261 return (NULL); 1262 for (segind = 0; segind < vm_phys_nsegs; segind++) { 1263 seg = &vm_phys_segs[segind]; 1264 if (seg->domain != domain) 1265 continue; 1266 if (seg->start >= high) 1267 break; 1268 if (low >= seg->end) 1269 continue; 1270 if (low <= seg->start) 1271 m_start = seg->first_page; 1272 else 1273 m_start = &seg->first_page[atop(low - seg->start)]; 1274 if (high < seg->end) 1275 pa_end = high; 1276 else 1277 pa_end = seg->end; 1278 if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) 1279 continue; 1280 m_end = &seg->first_page[atop(pa_end - seg->start)]; 1281 m_run = vm_page_scan_contig(npages, m_start, m_end, 1282 alignment, boundary, options); 1283 if (m_run != NULL) 1284 return (m_run); 1285 } 1286 return (NULL); 1287 } 1288 1289 /* 1290 * Search for the given physical page "m" in the free lists. If the search 1291 * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return 1292 * FALSE, indicating that "m" is not in the free lists. 1293 * 1294 * The free page queues must be locked. 1295 */ 1296 boolean_t 1297 vm_phys_unfree_page(vm_page_t m) 1298 { 1299 struct vm_freelist *fl; 1300 struct vm_phys_seg *seg; 1301 vm_paddr_t pa, pa_half; 1302 vm_page_t m_set, m_tmp; 1303 int order; 1304 1305 /* 1306 * First, find the contiguous, power of two-sized set of free 1307 * physical pages containing the given physical page "m" and 1308 * assign it to "m_set". 
1309 */ 1310 seg = &vm_phys_segs[m->segind]; 1311 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1312 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && 1313 order < VM_NFREEORDER - 1; ) { 1314 order++; 1315 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); 1316 if (pa >= seg->start) 1317 m_set = &seg->first_page[atop(pa - seg->start)]; 1318 else 1319 return (FALSE); 1320 } 1321 if (m_set->order < order) 1322 return (FALSE); 1323 if (m_set->order == VM_NFREEORDER) 1324 return (FALSE); 1325 KASSERT(m_set->order < VM_NFREEORDER, 1326 ("vm_phys_unfree_page: page %p has unexpected order %d", 1327 m_set, m_set->order)); 1328 1329 /* 1330 * Next, remove "m_set" from the free lists. Finally, extract 1331 * "m" from "m_set" using an iterative algorithm: While "m_set" 1332 * is larger than a page, shrink "m_set" by returning the half 1333 * of "m_set" that does not contain "m" to the free lists. 1334 */ 1335 fl = (*seg->free_queues)[m_set->pool]; 1336 order = m_set->order; 1337 vm_freelist_rem(fl, m_set, order); 1338 while (order > 0) { 1339 order--; 1340 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); 1341 if (m->phys_addr < pa_half) 1342 m_tmp = &seg->first_page[atop(pa_half - seg->start)]; 1343 else { 1344 m_tmp = m_set; 1345 m_set = &seg->first_page[atop(pa_half - seg->start)]; 1346 } 1347 vm_freelist_add(fl, m_tmp, order, 0); 1348 } 1349 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); 1350 return (TRUE); 1351 } 1352 1353 /* 1354 * Allocate a contiguous set of physical pages of the given size 1355 * "npages" from the free lists. All of the physical pages must be at 1356 * or above the given physical address "low" and below the given 1357 * physical address "high". The given value "alignment" determines the 1358 * alignment of the first physical page in the set. 
If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_phys_seg *seg;
	vm_page_t run;
	vm_paddr_t win_lo, win_hi;
	int idx;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);

	/* Walk the segment table from its last entry to its first. */
	for (idx = vm_phys_nsegs - 1; idx >= 0; idx--) {
		seg = &vm_phys_segs[idx];
		if (seg->domain != domain || seg->start >= high)
			continue;
		/* This segment ends at or below "low": stop searching. */
		if (low >= seg->end)
			break;

		/* Intersect [low, high) with the segment. */
		win_lo = low > seg->start ? low : seg->start;
		win_hi = high < seg->end ? high : seg->end;
		if (win_hi - win_lo < ptoa(npages))
			continue;

		run = vm_phys_alloc_seg_contig(seg, npages, low, high,
		    alignment, boundary);
		if (run != NULL)
			return (run);
	}
	return (NULL);
}

/*
 * Allocate a run of contiguous physical pages from the free list for the
 * specified segment.
1406 */ 1407 static vm_page_t 1408 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, 1409 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1410 { 1411 struct vm_freelist *fl; 1412 vm_paddr_t pa, pa_end, size; 1413 vm_page_t m, m_ret; 1414 u_long npages_end; 1415 int oind, order, pind; 1416 1417 KASSERT(npages > 0, ("npages is 0")); 1418 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1419 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1420 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1421 /* Compute the queue that is the best fit for npages. */ 1422 order = flsl(npages - 1); 1423 /* Search for a run satisfying the specified conditions. */ 1424 size = npages << PAGE_SHIFT; 1425 for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; 1426 oind++) { 1427 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1428 fl = (*seg->free_queues)[pind]; 1429 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { 1430 /* 1431 * Determine if the address range starting at pa 1432 * is within the given range, satisfies the 1433 * given alignment, and does not cross the given 1434 * boundary. 1435 */ 1436 pa = VM_PAGE_TO_PHYS(m_ret); 1437 pa_end = pa + size; 1438 if (pa < low || pa_end > high || 1439 !vm_addr_ok(pa, size, alignment, boundary)) 1440 continue; 1441 1442 /* 1443 * Is the size of this allocation request 1444 * no more than the largest block size? 1445 */ 1446 if (order < VM_NFREEORDER) 1447 goto done; 1448 1449 /* 1450 * Determine if the address range is valid 1451 * (without overflow in pa_end calculation) 1452 * and fits within the segment. 1453 */ 1454 if (pa_end < pa || 1455 pa < seg->start || seg->end < pa_end) 1456 continue; 1457 1458 /* 1459 * Determine if a sufficient number of 1460 * subsequent blocks to satisfy the 1461 * allocation request are free. 
1462 */ 1463 do { 1464 pa += 1 << 1465 (PAGE_SHIFT + VM_NFREEORDER - 1); 1466 if (pa >= pa_end) 1467 goto done; 1468 } while (VM_NFREEORDER - 1 == seg->first_page[ 1469 atop(pa - seg->start)].order); 1470 } 1471 } 1472 } 1473 return (NULL); 1474 done: 1475 for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { 1476 fl = (*seg->free_queues)[m->pool]; 1477 vm_freelist_rem(fl, m, oind); 1478 if (m->pool != VM_FREEPOOL_DEFAULT) 1479 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind); 1480 } 1481 /* Return excess pages to the free lists. */ 1482 npages_end = roundup2(npages, 1 << oind); 1483 if (npages < npages_end) { 1484 fl = (*seg->free_queues)[VM_FREEPOOL_DEFAULT]; 1485 vm_phys_enq_range(&m_ret[npages], npages_end - npages, fl, 0); 1486 } 1487 return (m_ret); 1488 } 1489 1490 /* 1491 * Return the index of the first unused slot which may be the terminating 1492 * entry. 1493 */ 1494 static int 1495 vm_phys_avail_count(void) 1496 { 1497 int i; 1498 1499 for (i = 0; phys_avail[i + 1]; i += 2) 1500 continue; 1501 if (i > PHYS_AVAIL_ENTRIES) 1502 panic("Improperly terminated phys_avail %d entries", i); 1503 1504 return (i); 1505 } 1506 1507 /* 1508 * Assert that a phys_avail entry is valid. 1509 */ 1510 static void 1511 vm_phys_avail_check(int i) 1512 { 1513 if (phys_avail[i] & PAGE_MASK) 1514 panic("Unaligned phys_avail[%d]: %#jx", i, 1515 (intmax_t)phys_avail[i]); 1516 if (phys_avail[i+1] & PAGE_MASK) 1517 panic("Unaligned phys_avail[%d + 1]: %#jx", i, 1518 (intmax_t)phys_avail[i]); 1519 if (phys_avail[i + 1] < phys_avail[i]) 1520 panic("phys_avail[%d] start %#jx < end %#jx", i, 1521 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]); 1522 } 1523 1524 /* 1525 * Return the index of an overlapping phys_avail entry or -1. 
1526 */ 1527 #ifdef NUMA 1528 static int 1529 vm_phys_avail_find(vm_paddr_t pa) 1530 { 1531 int i; 1532 1533 for (i = 0; phys_avail[i + 1]; i += 2) 1534 if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) 1535 return (i); 1536 return (-1); 1537 } 1538 #endif 1539 1540 /* 1541 * Return the index of the largest entry. 1542 */ 1543 int 1544 vm_phys_avail_largest(void) 1545 { 1546 vm_paddr_t sz, largesz; 1547 int largest; 1548 int i; 1549 1550 largest = 0; 1551 largesz = 0; 1552 for (i = 0; phys_avail[i + 1]; i += 2) { 1553 sz = vm_phys_avail_size(i); 1554 if (sz > largesz) { 1555 largesz = sz; 1556 largest = i; 1557 } 1558 } 1559 1560 return (largest); 1561 } 1562 1563 vm_paddr_t 1564 vm_phys_avail_size(int i) 1565 { 1566 1567 return (phys_avail[i + 1] - phys_avail[i]); 1568 } 1569 1570 /* 1571 * Split an entry at the address 'pa'. Return zero on success or errno. 1572 */ 1573 static int 1574 vm_phys_avail_split(vm_paddr_t pa, int i) 1575 { 1576 int cnt; 1577 1578 vm_phys_avail_check(i); 1579 if (pa <= phys_avail[i] || pa >= phys_avail[i + 1]) 1580 panic("vm_phys_avail_split: invalid address"); 1581 cnt = vm_phys_avail_count(); 1582 if (cnt >= PHYS_AVAIL_ENTRIES) 1583 return (ENOSPC); 1584 memmove(&phys_avail[i + 2], &phys_avail[i], 1585 (cnt - i) * sizeof(phys_avail[0])); 1586 phys_avail[i + 1] = pa; 1587 phys_avail[i + 2] = pa; 1588 vm_phys_avail_check(i); 1589 vm_phys_avail_check(i+2); 1590 1591 return (0); 1592 } 1593 1594 /* 1595 * Check if a given physical address can be included as part of a crash dump. 
1596 */ 1597 bool 1598 vm_phys_is_dumpable(vm_paddr_t pa) 1599 { 1600 vm_page_t m; 1601 int i; 1602 1603 if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) 1604 return ((m->flags & PG_NODUMP) == 0); 1605 1606 for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { 1607 if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) 1608 return (true); 1609 } 1610 return (false); 1611 } 1612 1613 void 1614 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end) 1615 { 1616 struct vm_phys_seg *seg; 1617 1618 if (vm_phys_early_nsegs == -1) 1619 panic("%s: called after initialization", __func__); 1620 if (vm_phys_early_nsegs == nitems(vm_phys_early_segs)) 1621 panic("%s: ran out of early segments", __func__); 1622 1623 seg = &vm_phys_early_segs[vm_phys_early_nsegs++]; 1624 seg->start = start; 1625 seg->end = end; 1626 } 1627 1628 /* 1629 * This routine allocates NUMA node specific memory before the page 1630 * allocator is bootstrapped. 1631 */ 1632 vm_paddr_t 1633 vm_phys_early_alloc(int domain, size_t alloc_size) 1634 { 1635 int i, mem_index, biggestone; 1636 vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align; 1637 1638 KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains), 1639 ("%s: invalid domain index %d", __func__, domain)); 1640 1641 /* 1642 * Search the mem_affinity array for the biggest address 1643 * range in the desired domain. This is used to constrain 1644 * the phys_avail selection below. 
1645 */ 1646 biggestsize = 0; 1647 mem_index = 0; 1648 mem_start = 0; 1649 mem_end = -1; 1650 #ifdef NUMA 1651 if (mem_affinity != NULL) { 1652 for (i = 0;; i++) { 1653 size = mem_affinity[i].end - mem_affinity[i].start; 1654 if (size == 0) 1655 break; 1656 if (domain != -1 && mem_affinity[i].domain != domain) 1657 continue; 1658 if (size > biggestsize) { 1659 mem_index = i; 1660 biggestsize = size; 1661 } 1662 } 1663 mem_start = mem_affinity[mem_index].start; 1664 mem_end = mem_affinity[mem_index].end; 1665 } 1666 #endif 1667 1668 /* 1669 * Now find biggest physical segment in within the desired 1670 * numa domain. 1671 */ 1672 biggestsize = 0; 1673 biggestone = 0; 1674 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1675 /* skip regions that are out of range */ 1676 if (phys_avail[i+1] - alloc_size < mem_start || 1677 phys_avail[i+1] > mem_end) 1678 continue; 1679 size = vm_phys_avail_size(i); 1680 if (size > biggestsize) { 1681 biggestone = i; 1682 biggestsize = size; 1683 } 1684 } 1685 alloc_size = round_page(alloc_size); 1686 1687 /* 1688 * Grab single pages from the front to reduce fragmentation. 1689 */ 1690 if (alloc_size == PAGE_SIZE) { 1691 pa = phys_avail[biggestone]; 1692 phys_avail[biggestone] += PAGE_SIZE; 1693 vm_phys_avail_check(biggestone); 1694 return (pa); 1695 } 1696 1697 /* 1698 * Naturally align large allocations. 1699 */ 1700 align = phys_avail[biggestone + 1] & (alloc_size - 1); 1701 if (alloc_size + align > biggestsize) 1702 panic("cannot find a large enough size\n"); 1703 if (align != 0 && 1704 vm_phys_avail_split(phys_avail[biggestone + 1] - align, 1705 biggestone) != 0) 1706 /* Wasting memory. 
*/ 1707 phys_avail[biggestone + 1] -= align; 1708 1709 phys_avail[biggestone + 1] -= alloc_size; 1710 vm_phys_avail_check(biggestone); 1711 pa = phys_avail[biggestone + 1]; 1712 return (pa); 1713 } 1714 1715 void 1716 vm_phys_early_startup(void) 1717 { 1718 struct vm_phys_seg *seg; 1719 int i; 1720 1721 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1722 phys_avail[i] = round_page(phys_avail[i]); 1723 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 1724 } 1725 1726 for (i = 0; i < vm_phys_early_nsegs; i++) { 1727 seg = &vm_phys_early_segs[i]; 1728 vm_phys_add_seg(seg->start, seg->end); 1729 } 1730 vm_phys_early_nsegs = -1; 1731 1732 #ifdef NUMA 1733 /* Force phys_avail to be split by domain. */ 1734 if (mem_affinity != NULL) { 1735 int idx; 1736 1737 for (i = 0; mem_affinity[i].end != 0; i++) { 1738 idx = vm_phys_avail_find(mem_affinity[i].start); 1739 if (idx != -1 && 1740 phys_avail[idx] != mem_affinity[i].start) 1741 vm_phys_avail_split(mem_affinity[i].start, idx); 1742 idx = vm_phys_avail_find(mem_affinity[i].end); 1743 if (idx != -1 && 1744 phys_avail[idx] != mem_affinity[i].end) 1745 vm_phys_avail_split(mem_affinity[i].end, idx); 1746 } 1747 } 1748 #endif 1749 } 1750 1751 #ifdef DDB 1752 /* 1753 * Show the number of physical pages in each of the free lists. 
1754 */ 1755 DB_SHOW_COMMAND(freepages, db_show_freepages) 1756 { 1757 struct vm_freelist *fl; 1758 int flind, oind, pind, dom; 1759 1760 for (dom = 0; dom < vm_ndomains; dom++) { 1761 db_printf("DOMAIN: %d\n", dom); 1762 for (flind = 0; flind < vm_nfreelists; flind++) { 1763 db_printf("FREE LIST %d:\n" 1764 "\n ORDER (SIZE) | NUMBER" 1765 "\n ", flind); 1766 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1767 db_printf(" | POOL %d", pind); 1768 db_printf("\n-- "); 1769 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1770 db_printf("-- -- "); 1771 db_printf("--\n"); 1772 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 1773 db_printf(" %2.2d (%6.6dK)", oind, 1774 1 << (PAGE_SHIFT - 10 + oind)); 1775 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1776 fl = vm_phys_free_queues[dom][flind][pind]; 1777 db_printf(" | %6.6d", fl[oind].lcnt); 1778 } 1779 db_printf("\n"); 1780 } 1781 db_printf("\n"); 1782 } 1783 db_printf("\n"); 1784 } 1785 } 1786 #endif 1787