1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2002-2006 Rice University 5 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu> 6 * All rights reserved. 7 * 8 * This software was developed for the FreeBSD Project by Alan L. Cox, 9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Physical memory system implementation 36 * 37 * Any external functions defined by this module are only to be used by the 38 * virtual memory system. 
*/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

/* The number of physical segments must fit in the bits of a long. */
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");

#ifdef NUMA
/*
 * NUMA topology tables.  Both pointers are installed by
 * vm_phys_register_domains() when more than one domain is registered and
 * stay NULL otherwise.  mem_affinity is an array terminated by an entry
 * whose end is 0; mem_locality is indexed as [from * vm_ndomains + to].
 */
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;
#endif

/* Number of memory domains; remains 1 unless more are registered. */
int __read_mostly vm_ndomains = 1;
/* Set of valid domains; domain 0 is always a member. */
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

/*
 * Table of physical memory segments and the number of valid entries.
 * _vm_phys_create_seg() inserts new entries in ascending address order.
 */
struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

/* Red-black tree of registered fictitious ranges, ordered by address. */
RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(_vm_phys_fictitious_tree);

/*
 * One registered fictitious range.  A search key is built from the same
 * structure with start set to the address being looked up and end set to 0
 * (see vm_phys_fictitious_cmp()).
 */
struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

/* Held around every lookup, insertion and removal on the fictitious tree. */
static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

/*
 * The free page queues, indexed by [domain][free list index][pool][order].
 */
static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

/* Number of free lists actually in use; computed by vm_phys_init(). */
static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 * The allocation routines treat a negative entry as "free list not present".
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
/* Physical address 4G: segments ending at or below it are DMA32 candidates. */
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

/* Read-only string sysctls exporting the allocator state under "vm". */
static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

/* Forward declarations of file-local helpers. */
static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
    vm_paddr_t boundary);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

/*
 * Red-black tree helpers for vm fictitious range management.
177 */ 178 static inline int 179 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, 180 struct vm_phys_fictitious_seg *range) 181 { 182 183 KASSERT(range->start != 0 && range->end != 0, 184 ("Invalid range passed on search for vm_fictitious page")); 185 if (p->start >= range->end) 186 return (1); 187 if (p->start < range->start) 188 return (-1); 189 190 return (0); 191 } 192 193 static int 194 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, 195 struct vm_phys_fictitious_seg *p2) 196 { 197 198 /* Check if this is a search for a page */ 199 if (p1->end == 0) 200 return (vm_phys_fictitious_in_range(p1, p2)); 201 202 KASSERT(p2->end != 0, 203 ("Invalid range passed as second parameter to vm fictitious comparison")); 204 205 /* Searching to add a new range */ 206 if (p1->end <= p2->start) 207 return (-1); 208 if (p1->start >= p2->end) 209 return (1); 210 211 panic("Trying to add overlapping vm fictitious ranges:\n" 212 "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, 213 (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); 214 } 215 216 int 217 vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high) 218 { 219 #ifdef NUMA 220 domainset_t mask; 221 int i; 222 223 if (vm_ndomains == 1 || mem_affinity == NULL) 224 return (0); 225 226 DOMAINSET_ZERO(&mask); 227 /* 228 * Check for any memory that overlaps low, high. 229 */ 230 for (i = 0; mem_affinity[i].end != 0; i++) 231 if (mem_affinity[i].start <= high && 232 mem_affinity[i].end >= low) 233 DOMAINSET_SET(mem_affinity[i].domain, &mask); 234 if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask)) 235 return (prefer); 236 if (DOMAINSET_EMPTY(&mask)) 237 panic("vm_phys_domain_match: Impossible constraint"); 238 return (DOMAINSET_FFS(&mask) - 1); 239 #else 240 return (0); 241 #endif 242 } 243 244 /* 245 * Outputs the state of the physical memory allocator, specifically, 246 * the amount of physical memory in each free list. 
247 */ 248 static int 249 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) 250 { 251 struct sbuf sbuf; 252 struct vm_freelist *fl; 253 int dom, error, flind, oind, pind; 254 255 error = sysctl_wire_old_buffer(req, 0); 256 if (error != 0) 257 return (error); 258 sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); 259 for (dom = 0; dom < vm_ndomains; dom++) { 260 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); 261 for (flind = 0; flind < vm_nfreelists; flind++) { 262 sbuf_printf(&sbuf, "\nFREE LIST %d:\n" 263 "\n ORDER (SIZE) | NUMBER" 264 "\n ", flind); 265 for (pind = 0; pind < VM_NFREEPOOL; pind++) 266 sbuf_printf(&sbuf, " | POOL %d", pind); 267 sbuf_printf(&sbuf, "\n-- "); 268 for (pind = 0; pind < VM_NFREEPOOL; pind++) 269 sbuf_printf(&sbuf, "-- -- "); 270 sbuf_printf(&sbuf, "--\n"); 271 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 272 sbuf_printf(&sbuf, " %2d (%6dK)", oind, 273 1 << (PAGE_SHIFT - 10 + oind)); 274 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 275 fl = vm_phys_free_queues[dom][flind][pind]; 276 sbuf_printf(&sbuf, " | %6d", 277 fl[oind].lcnt); 278 } 279 sbuf_printf(&sbuf, "\n"); 280 } 281 } 282 } 283 error = sbuf_finish(&sbuf); 284 sbuf_delete(&sbuf); 285 return (error); 286 } 287 288 /* 289 * Outputs the set of physical memory segments. 
290 */ 291 static int 292 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) 293 { 294 struct sbuf sbuf; 295 struct vm_phys_seg *seg; 296 int error, segind; 297 298 error = sysctl_wire_old_buffer(req, 0); 299 if (error != 0) 300 return (error); 301 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 302 for (segind = 0; segind < vm_phys_nsegs; segind++) { 303 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); 304 seg = &vm_phys_segs[segind]; 305 sbuf_printf(&sbuf, "start: %#jx\n", 306 (uintmax_t)seg->start); 307 sbuf_printf(&sbuf, "end: %#jx\n", 308 (uintmax_t)seg->end); 309 sbuf_printf(&sbuf, "domain: %d\n", seg->domain); 310 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); 311 } 312 error = sbuf_finish(&sbuf); 313 sbuf_delete(&sbuf); 314 return (error); 315 } 316 317 /* 318 * Return affinity, or -1 if there's no affinity information. 319 */ 320 int 321 vm_phys_mem_affinity(int f, int t) 322 { 323 324 #ifdef NUMA 325 if (mem_locality == NULL) 326 return (-1); 327 if (f >= vm_ndomains || t >= vm_ndomains) 328 return (-1); 329 return (mem_locality[f * vm_ndomains + t]); 330 #else 331 return (-1); 332 #endif 333 } 334 335 #ifdef NUMA 336 /* 337 * Outputs the VM locality table. 
338 */ 339 static int 340 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) 341 { 342 struct sbuf sbuf; 343 int error, i, j; 344 345 error = sysctl_wire_old_buffer(req, 0); 346 if (error != 0) 347 return (error); 348 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 349 350 sbuf_printf(&sbuf, "\n"); 351 352 for (i = 0; i < vm_ndomains; i++) { 353 sbuf_printf(&sbuf, "%d: ", i); 354 for (j = 0; j < vm_ndomains; j++) { 355 sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); 356 } 357 sbuf_printf(&sbuf, "\n"); 358 } 359 error = sbuf_finish(&sbuf); 360 sbuf_delete(&sbuf); 361 return (error); 362 } 363 #endif 364 365 static void 366 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) 367 { 368 369 m->order = order; 370 if (tail) 371 TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); 372 else 373 TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); 374 fl[order].lcnt++; 375 } 376 377 static void 378 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) 379 { 380 381 TAILQ_REMOVE(&fl[order].pl, m, listq); 382 fl[order].lcnt--; 383 m->order = VM_NFREEORDER; 384 } 385 386 /* 387 * Create a physical memory segment. 
388 */ 389 static void 390 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) 391 { 392 struct vm_phys_seg *seg; 393 394 KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, 395 ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); 396 KASSERT(domain >= 0 && domain < vm_ndomains, 397 ("vm_phys_create_seg: invalid domain provided")); 398 seg = &vm_phys_segs[vm_phys_nsegs++]; 399 while (seg > vm_phys_segs && (seg - 1)->start >= end) { 400 *seg = *(seg - 1); 401 seg--; 402 } 403 seg->start = start; 404 seg->end = end; 405 seg->domain = domain; 406 } 407 408 static void 409 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) 410 { 411 #ifdef NUMA 412 int i; 413 414 if (mem_affinity == NULL) { 415 _vm_phys_create_seg(start, end, 0); 416 return; 417 } 418 419 for (i = 0;; i++) { 420 if (mem_affinity[i].end == 0) 421 panic("Reached end of affinity info"); 422 if (mem_affinity[i].end <= start) 423 continue; 424 if (mem_affinity[i].start > start) 425 panic("No affinity info for start %jx", 426 (uintmax_t)start); 427 if (mem_affinity[i].end >= end) { 428 _vm_phys_create_seg(start, end, 429 mem_affinity[i].domain); 430 break; 431 } 432 _vm_phys_create_seg(start, mem_affinity[i].end, 433 mem_affinity[i].domain); 434 start = mem_affinity[i].end; 435 } 436 #else 437 _vm_phys_create_seg(start, end, 0); 438 #endif 439 } 440 441 /* 442 * Add a physical memory segment. 443 */ 444 void 445 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) 446 { 447 vm_paddr_t paddr; 448 449 KASSERT((start & PAGE_MASK) == 0, 450 ("vm_phys_define_seg: start is not page aligned")); 451 KASSERT((end & PAGE_MASK) == 0, 452 ("vm_phys_define_seg: end is not page aligned")); 453 454 /* 455 * Split the physical memory segment if it spans two or more free 456 * list boundaries. 
457 */ 458 paddr = start; 459 #ifdef VM_FREELIST_LOWMEM 460 if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { 461 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); 462 paddr = VM_LOWMEM_BOUNDARY; 463 } 464 #endif 465 #ifdef VM_FREELIST_DMA32 466 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { 467 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); 468 paddr = VM_DMA32_BOUNDARY; 469 } 470 #endif 471 vm_phys_create_seg(paddr, end); 472 } 473 474 /* 475 * Initialize the physical memory allocator. 476 * 477 * Requires that vm_page_array is initialized! 478 */ 479 void 480 vm_phys_init(void) 481 { 482 struct vm_freelist *fl; 483 struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg; 484 u_long npages; 485 int dom, flind, freelist, oind, pind, segind; 486 487 /* 488 * Compute the number of free lists, and generate the mapping from the 489 * manifest constants VM_FREELIST_* to the free list indices. 490 * 491 * Initially, the entries of vm_freelist_to_flind[] are set to either 492 * 0 or 1 to indicate which free lists should be created. 493 */ 494 npages = 0; 495 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 496 seg = &vm_phys_segs[segind]; 497 #ifdef VM_FREELIST_LOWMEM 498 if (seg->end <= VM_LOWMEM_BOUNDARY) 499 vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; 500 else 501 #endif 502 #ifdef VM_FREELIST_DMA32 503 if ( 504 #ifdef VM_DMA32_NPAGES_THRESHOLD 505 /* 506 * Create the DMA32 free list only if the amount of 507 * physical memory above physical address 4G exceeds the 508 * given threshold. 509 */ 510 npages > VM_DMA32_NPAGES_THRESHOLD && 511 #endif 512 seg->end <= VM_DMA32_BOUNDARY) 513 vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; 514 else 515 #endif 516 { 517 npages += atop(seg->end - seg->start); 518 vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; 519 } 520 } 521 /* Change each entry into a running total of the free lists. 
*/ 522 for (freelist = 1; freelist < VM_NFREELIST; freelist++) { 523 vm_freelist_to_flind[freelist] += 524 vm_freelist_to_flind[freelist - 1]; 525 } 526 vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; 527 KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); 528 /* Change each entry into a free list index. */ 529 for (freelist = 0; freelist < VM_NFREELIST; freelist++) 530 vm_freelist_to_flind[freelist]--; 531 532 /* 533 * Initialize the first_page and free_queues fields of each physical 534 * memory segment. 535 */ 536 #ifdef VM_PHYSSEG_SPARSE 537 npages = 0; 538 #endif 539 for (segind = 0; segind < vm_phys_nsegs; segind++) { 540 seg = &vm_phys_segs[segind]; 541 #ifdef VM_PHYSSEG_SPARSE 542 seg->first_page = &vm_page_array[npages]; 543 npages += atop(seg->end - seg->start); 544 #else 545 seg->first_page = PHYS_TO_VM_PAGE(seg->start); 546 #endif 547 #ifdef VM_FREELIST_LOWMEM 548 if (seg->end <= VM_LOWMEM_BOUNDARY) { 549 flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; 550 KASSERT(flind >= 0, 551 ("vm_phys_init: LOWMEM flind < 0")); 552 } else 553 #endif 554 #ifdef VM_FREELIST_DMA32 555 if (seg->end <= VM_DMA32_BOUNDARY) { 556 flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; 557 KASSERT(flind >= 0, 558 ("vm_phys_init: DMA32 flind < 0")); 559 } else 560 #endif 561 { 562 flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; 563 KASSERT(flind >= 0, 564 ("vm_phys_init: DEFAULT flind < 0")); 565 } 566 seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; 567 } 568 569 /* 570 * Coalesce physical memory segments that are contiguous and share the 571 * same per-domain free queues. 
572 */ 573 prev_seg = vm_phys_segs; 574 seg = &vm_phys_segs[1]; 575 end_seg = &vm_phys_segs[vm_phys_nsegs]; 576 while (seg < end_seg) { 577 if (prev_seg->end == seg->start && 578 prev_seg->free_queues == seg->free_queues) { 579 prev_seg->end = seg->end; 580 KASSERT(prev_seg->domain == seg->domain, 581 ("vm_phys_init: free queues cannot span domains")); 582 vm_phys_nsegs--; 583 end_seg--; 584 for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++) 585 *tmp_seg = *(tmp_seg + 1); 586 } else { 587 prev_seg = seg; 588 seg++; 589 } 590 } 591 592 /* 593 * Initialize the free queues. 594 */ 595 for (dom = 0; dom < vm_ndomains; dom++) { 596 for (flind = 0; flind < vm_nfreelists; flind++) { 597 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 598 fl = vm_phys_free_queues[dom][flind][pind]; 599 for (oind = 0; oind < VM_NFREEORDER; oind++) 600 TAILQ_INIT(&fl[oind].pl); 601 } 602 } 603 } 604 605 rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); 606 } 607 608 /* 609 * Register info about the NUMA topology of the system. 610 * 611 * Invoked by platform-dependent code prior to vm_phys_init(). 612 */ 613 void 614 vm_phys_register_domains(int ndomains, struct mem_affinity *affinity, 615 int *locality) 616 { 617 #ifdef NUMA 618 int d, i; 619 620 /* 621 * For now the only override value that we support is 1, which 622 * effectively disables NUMA-awareness in the allocators. 623 */ 624 d = 0; 625 TUNABLE_INT_FETCH("vm.numa.disabled", &d); 626 if (d) 627 ndomains = 1; 628 629 if (ndomains > 1) { 630 vm_ndomains = ndomains; 631 mem_affinity = affinity; 632 mem_locality = locality; 633 } 634 635 for (i = 0; i < vm_ndomains; i++) 636 DOMAINSET_SET(i, &all_domains); 637 #else 638 (void)ndomains; 639 (void)affinity; 640 (void)locality; 641 #endif 642 } 643 644 int 645 _vm_phys_domain(vm_paddr_t pa) 646 { 647 #ifdef NUMA 648 int i; 649 650 if (vm_ndomains == 1 || mem_affinity == NULL) 651 return (0); 652 653 /* 654 * Check for any memory that overlaps. 
655 */ 656 for (i = 0; mem_affinity[i].end != 0; i++) 657 if (mem_affinity[i].start <= pa && 658 mem_affinity[i].end >= pa) 659 return (mem_affinity[i].domain); 660 #endif 661 return (0); 662 } 663 664 /* 665 * Split a contiguous, power of two-sized set of physical pages. 666 * 667 * When this function is called by a page allocation function, the caller 668 * should request insertion at the head unless the order [order, oind) queues 669 * are known to be empty. The objective being to reduce the likelihood of 670 * long-term fragmentation by promoting contemporaneous allocation and 671 * (hopefully) deallocation. 672 */ 673 static __inline void 674 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, 675 int tail) 676 { 677 vm_page_t m_buddy; 678 679 while (oind > order) { 680 oind--; 681 m_buddy = &m[1 << oind]; 682 KASSERT(m_buddy->order == VM_NFREEORDER, 683 ("vm_phys_split_pages: page %p has unexpected order %d", 684 m_buddy, m_buddy->order)); 685 vm_freelist_add(fl, m_buddy, oind, tail); 686 } 687 } 688 689 /* 690 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned 691 * and sized set to the specified free list. 692 * 693 * When this function is called by a page allocation function, the caller 694 * should request insertion at the head unless the lower-order queues are 695 * known to be empty. The objective being to reduce the likelihood of long- 696 * term fragmentation by promoting contemporaneous allocation and (hopefully) 697 * deallocation. 698 * 699 * The physical page m's buddy must not be free. 
700 */ 701 static void 702 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) 703 { 704 u_int n; 705 int order; 706 707 KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0")); 708 KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) & 709 ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0, 710 ("vm_phys_enq_range: page %p and npages %u are misaligned", 711 m, npages)); 712 do { 713 KASSERT(m->order == VM_NFREEORDER, 714 ("vm_phys_enq_range: page %p has unexpected order %d", 715 m, m->order)); 716 order = ffs(npages) - 1; 717 KASSERT(order < VM_NFREEORDER, 718 ("vm_phys_enq_range: order %d is out of range", order)); 719 vm_freelist_add(fl, m, order, tail); 720 n = 1 << order; 721 m += n; 722 npages -= n; 723 } while (npages > 0); 724 } 725 726 /* 727 * Tries to allocate the specified number of pages from the specified pool 728 * within the specified domain. Returns the actual number of allocated pages 729 * and a pointer to each page through the array ma[]. 730 * 731 * The returned pages may not be physically contiguous. However, in contrast 732 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0), 733 * calling this function once to allocate the desired number of pages will 734 * avoid wasted time in vm_phys_split_pages(). 735 * 736 * The free page queues for the specified domain must be locked. 
737 */ 738 int 739 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) 740 { 741 struct vm_freelist *alt, *fl; 742 vm_page_t m; 743 int avail, end, flind, freelist, i, need, oind, pind; 744 745 KASSERT(domain >= 0 && domain < vm_ndomains, 746 ("vm_phys_alloc_npages: domain %d is out of range", domain)); 747 KASSERT(pool < VM_NFREEPOOL, 748 ("vm_phys_alloc_npages: pool %d is out of range", pool)); 749 KASSERT(npages <= 1 << (VM_NFREEORDER - 1), 750 ("vm_phys_alloc_npages: npages %d is out of range", npages)); 751 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 752 i = 0; 753 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 754 flind = vm_freelist_to_flind[freelist]; 755 if (flind < 0) 756 continue; 757 fl = vm_phys_free_queues[domain][flind][pool]; 758 for (oind = 0; oind < VM_NFREEORDER; oind++) { 759 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 760 vm_freelist_rem(fl, m, oind); 761 avail = 1 << oind; 762 need = imin(npages - i, avail); 763 for (end = i + need; i < end;) 764 ma[i++] = m++; 765 if (need < avail) { 766 /* 767 * Return excess pages to fl. Its 768 * order [0, oind) queues are empty. 769 */ 770 vm_phys_enq_range(m, avail - need, fl, 771 1); 772 return (npages); 773 } else if (i == npages) 774 return (npages); 775 } 776 } 777 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 778 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 779 alt = vm_phys_free_queues[domain][flind][pind]; 780 while ((m = TAILQ_FIRST(&alt[oind].pl)) != 781 NULL) { 782 vm_freelist_rem(alt, m, oind); 783 vm_phys_set_pool(pool, m, oind); 784 avail = 1 << oind; 785 need = imin(npages - i, avail); 786 for (end = i + need; i < end;) 787 ma[i++] = m++; 788 if (need < avail) { 789 /* 790 * Return excess pages to fl. 791 * Its order [0, oind) queues 792 * are empty. 
793 */ 794 vm_phys_enq_range(m, avail - 795 need, fl, 1); 796 return (npages); 797 } else if (i == npages) 798 return (npages); 799 } 800 } 801 } 802 } 803 return (i); 804 } 805 806 /* 807 * Allocate a contiguous, power of two-sized set of physical pages 808 * from the free lists. 809 * 810 * The free page queues must be locked. 811 */ 812 vm_page_t 813 vm_phys_alloc_pages(int domain, int pool, int order) 814 { 815 vm_page_t m; 816 int freelist; 817 818 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 819 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); 820 if (m != NULL) 821 return (m); 822 } 823 return (NULL); 824 } 825 826 /* 827 * Allocate a contiguous, power of two-sized set of physical pages from the 828 * specified free list. The free list must be specified using one of the 829 * manifest constants VM_FREELIST_*. 830 * 831 * The free page queues must be locked. 832 */ 833 vm_page_t 834 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) 835 { 836 struct vm_freelist *alt, *fl; 837 vm_page_t m; 838 int oind, pind, flind; 839 840 KASSERT(domain >= 0 && domain < vm_ndomains, 841 ("vm_phys_alloc_freelist_pages: domain %d is out of range", 842 domain)); 843 KASSERT(freelist < VM_NFREELIST, 844 ("vm_phys_alloc_freelist_pages: freelist %d is out of range", 845 freelist)); 846 KASSERT(pool < VM_NFREEPOOL, 847 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); 848 KASSERT(order < VM_NFREEORDER, 849 ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); 850 851 flind = vm_freelist_to_flind[freelist]; 852 /* Check if freelist is present */ 853 if (flind < 0) 854 return (NULL); 855 856 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 857 fl = &vm_phys_free_queues[domain][flind][pool][0]; 858 for (oind = order; oind < VM_NFREEORDER; oind++) { 859 m = TAILQ_FIRST(&fl[oind].pl); 860 if (m != NULL) { 861 vm_freelist_rem(fl, m, oind); 862 /* The order [order, oind) queues are empty. 
*/ 863 vm_phys_split_pages(m, oind, fl, order, 1); 864 return (m); 865 } 866 } 867 868 /* 869 * The given pool was empty. Find the largest 870 * contiguous, power-of-two-sized set of pages in any 871 * pool. Transfer these pages to the given pool, and 872 * use them to satisfy the allocation. 873 */ 874 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { 875 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 876 alt = &vm_phys_free_queues[domain][flind][pind][0]; 877 m = TAILQ_FIRST(&alt[oind].pl); 878 if (m != NULL) { 879 vm_freelist_rem(alt, m, oind); 880 vm_phys_set_pool(pool, m, oind); 881 /* The order [order, oind) queues are empty. */ 882 vm_phys_split_pages(m, oind, fl, order, 1); 883 return (m); 884 } 885 } 886 } 887 return (NULL); 888 } 889 890 /* 891 * Find the vm_page corresponding to the given physical address. 892 */ 893 vm_page_t 894 vm_phys_paddr_to_vm_page(vm_paddr_t pa) 895 { 896 struct vm_phys_seg *seg; 897 int segind; 898 899 for (segind = 0; segind < vm_phys_nsegs; segind++) { 900 seg = &vm_phys_segs[segind]; 901 if (pa >= seg->start && pa < seg->end) 902 return (&seg->first_page[atop(pa - seg->start)]); 903 } 904 return (NULL); 905 } 906 907 vm_page_t 908 vm_phys_fictitious_to_vm_page(vm_paddr_t pa) 909 { 910 struct vm_phys_fictitious_seg tmp, *seg; 911 vm_page_t m; 912 913 m = NULL; 914 tmp.start = pa; 915 tmp.end = 0; 916 917 rw_rlock(&vm_phys_fictitious_reg_lock); 918 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 919 rw_runlock(&vm_phys_fictitious_reg_lock); 920 if (seg == NULL) 921 return (NULL); 922 923 m = &seg->first_page[atop(pa - seg->start)]; 924 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); 925 926 return (m); 927 } 928 929 static inline void 930 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, 931 long page_count, vm_memattr_t memattr) 932 { 933 long i; 934 935 bzero(range, page_count * sizeof(*range)); 936 for (i = 0; i < page_count; i++) { 937 vm_page_initfake(&range[i], start + 
PAGE_SIZE * i, memattr); 938 range[i].oflags &= ~VPO_UNMANAGED; 939 range[i].busy_lock = VPB_UNBUSIED; 940 } 941 } 942 943 int 944 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, 945 vm_memattr_t memattr) 946 { 947 struct vm_phys_fictitious_seg *seg; 948 vm_page_t fp; 949 long page_count; 950 #ifdef VM_PHYSSEG_DENSE 951 long pi, pe; 952 long dpage_count; 953 #endif 954 955 KASSERT(start < end, 956 ("Start of segment isn't less than end (start: %jx end: %jx)", 957 (uintmax_t)start, (uintmax_t)end)); 958 959 page_count = (end - start) / PAGE_SIZE; 960 961 #ifdef VM_PHYSSEG_DENSE 962 pi = atop(start); 963 pe = atop(end); 964 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 965 fp = &vm_page_array[pi - first_page]; 966 if ((pe - first_page) > vm_page_array_size) { 967 /* 968 * We have a segment that starts inside 969 * of vm_page_array, but ends outside of it. 970 * 971 * Use vm_page_array pages for those that are 972 * inside of the vm_page_array range, and 973 * allocate the remaining ones. 974 */ 975 dpage_count = vm_page_array_size - (pi - first_page); 976 vm_phys_fictitious_init_range(fp, start, dpage_count, 977 memattr); 978 page_count -= dpage_count; 979 start += ptoa(dpage_count); 980 goto alloc; 981 } 982 /* 983 * We can allocate the full range from vm_page_array, 984 * so there's no need to register the range in the tree. 985 */ 986 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 987 return (0); 988 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 989 /* 990 * We have a segment that ends inside of vm_page_array, 991 * but starts outside of it. 
992 */ 993 fp = &vm_page_array[0]; 994 dpage_count = pe - first_page; 995 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, 996 memattr); 997 end -= ptoa(dpage_count); 998 page_count -= dpage_count; 999 goto alloc; 1000 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1001 /* 1002 * Trying to register a fictitious range that expands before 1003 * and after vm_page_array. 1004 */ 1005 return (EINVAL); 1006 } else { 1007 alloc: 1008 #endif 1009 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, 1010 M_WAITOK); 1011 #ifdef VM_PHYSSEG_DENSE 1012 } 1013 #endif 1014 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1015 1016 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); 1017 seg->start = start; 1018 seg->end = end; 1019 seg->first_page = fp; 1020 1021 rw_wlock(&vm_phys_fictitious_reg_lock); 1022 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); 1023 rw_wunlock(&vm_phys_fictitious_reg_lock); 1024 1025 return (0); 1026 } 1027 1028 void 1029 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) 1030 { 1031 struct vm_phys_fictitious_seg *seg, tmp; 1032 #ifdef VM_PHYSSEG_DENSE 1033 long pi, pe; 1034 #endif 1035 1036 KASSERT(start < end, 1037 ("Start of segment isn't less than end (start: %jx end: %jx)", 1038 (uintmax_t)start, (uintmax_t)end)); 1039 1040 #ifdef VM_PHYSSEG_DENSE 1041 pi = atop(start); 1042 pe = atop(end); 1043 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1044 if ((pe - first_page) <= vm_page_array_size) { 1045 /* 1046 * This segment was allocated using vm_page_array 1047 * only, there's nothing to do since those pages 1048 * were never added to the tree. 1049 */ 1050 return; 1051 } 1052 /* 1053 * We have a segment that starts inside 1054 * of vm_page_array, but ends outside of it. 1055 * 1056 * Calculate how many pages were added to the 1057 * tree and free them. 
1058 */ 1059 start = ptoa(first_page + vm_page_array_size); 1060 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1061 /* 1062 * We have a segment that ends inside of vm_page_array, 1063 * but starts outside of it. 1064 */ 1065 end = ptoa(first_page); 1066 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1067 /* Since it's not possible to register such a range, panic. */ 1068 panic( 1069 "Unregistering not registered fictitious range [%#jx:%#jx]", 1070 (uintmax_t)start, (uintmax_t)end); 1071 } 1072 #endif 1073 tmp.start = start; 1074 tmp.end = 0; 1075 1076 rw_wlock(&vm_phys_fictitious_reg_lock); 1077 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1078 if (seg->start != start || seg->end != end) { 1079 rw_wunlock(&vm_phys_fictitious_reg_lock); 1080 panic( 1081 "Unregistering not registered fictitious range [%#jx:%#jx]", 1082 (uintmax_t)start, (uintmax_t)end); 1083 } 1084 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); 1085 rw_wunlock(&vm_phys_fictitious_reg_lock); 1086 free(seg->first_page, M_FICT_PAGES); 1087 free(seg, M_FICT_PAGES); 1088 } 1089 1090 /* 1091 * Free a contiguous, power of two-sized set of physical pages. 1092 * 1093 * The free page queues must be locked. 
1094 */ 1095 void 1096 vm_phys_free_pages(vm_page_t m, int order) 1097 { 1098 struct vm_freelist *fl; 1099 struct vm_phys_seg *seg; 1100 vm_paddr_t pa; 1101 vm_page_t m_buddy; 1102 1103 KASSERT(m->order == VM_NFREEORDER, 1104 ("vm_phys_free_pages: page %p has unexpected order %d", 1105 m, m->order)); 1106 KASSERT(m->pool < VM_NFREEPOOL, 1107 ("vm_phys_free_pages: page %p has unexpected pool %d", 1108 m, m->pool)); 1109 KASSERT(order < VM_NFREEORDER, 1110 ("vm_phys_free_pages: order %d is out of range", order)); 1111 seg = &vm_phys_segs[m->segind]; 1112 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1113 if (order < VM_NFREEORDER - 1) { 1114 pa = VM_PAGE_TO_PHYS(m); 1115 do { 1116 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); 1117 if (pa < seg->start || pa >= seg->end) 1118 break; 1119 m_buddy = &seg->first_page[atop(pa - seg->start)]; 1120 if (m_buddy->order != order) 1121 break; 1122 fl = (*seg->free_queues)[m_buddy->pool]; 1123 vm_freelist_rem(fl, m_buddy, order); 1124 if (m_buddy->pool != m->pool) 1125 vm_phys_set_pool(m->pool, m_buddy, order); 1126 order++; 1127 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); 1128 m = &seg->first_page[atop(pa - seg->start)]; 1129 } while (order < VM_NFREEORDER - 1); 1130 } 1131 fl = (*seg->free_queues)[m->pool]; 1132 vm_freelist_add(fl, m, order, 1); 1133 } 1134 1135 /* 1136 * Return the largest possible order of a set of pages starting at m. 1137 */ 1138 static int 1139 max_order(vm_page_t m) 1140 { 1141 1142 /* 1143 * Unsigned "min" is used here so that "order" is assigned 1144 * "VM_NFREEORDER - 1" when "m"'s physical address is zero 1145 * or the low-order bits of its physical address are zero 1146 * because the size of a physical address exceeds the size of 1147 * a long. 1148 */ 1149 return (min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, 1150 VM_NFREEORDER - 1)); 1151 } 1152 1153 /* 1154 * Free a contiguous, arbitrarily sized set of physical pages, without 1155 * merging across set boundaries. 
1156 * 1157 * The free page queues must be locked. 1158 */ 1159 void 1160 vm_phys_enqueue_contig(vm_page_t m, u_long npages) 1161 { 1162 struct vm_freelist *fl; 1163 struct vm_phys_seg *seg; 1164 vm_page_t m_end; 1165 int order; 1166 1167 /* 1168 * Avoid unnecessary coalescing by freeing the pages in the largest 1169 * possible power-of-two-sized subsets. 1170 */ 1171 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1172 seg = &vm_phys_segs[m->segind]; 1173 fl = (*seg->free_queues)[m->pool]; 1174 m_end = m + npages; 1175 /* Free blocks of increasing size. */ 1176 while ((order = max_order(m)) < VM_NFREEORDER - 1 && 1177 m + (1 << order) <= m_end) { 1178 KASSERT(seg == &vm_phys_segs[m->segind], 1179 ("%s: page range [%p,%p) spans multiple segments", 1180 __func__, m_end - npages, m)); 1181 vm_freelist_add(fl, m, order, 1); 1182 m += 1 << order; 1183 } 1184 /* Free blocks of maximum size. */ 1185 while (m + (1 << order) <= m_end) { 1186 KASSERT(seg == &vm_phys_segs[m->segind], 1187 ("%s: page range [%p,%p) spans multiple segments", 1188 __func__, m_end - npages, m)); 1189 vm_freelist_add(fl, m, order, 1); 1190 m += 1 << order; 1191 } 1192 /* Free blocks of diminishing size. */ 1193 while (m < m_end) { 1194 KASSERT(seg == &vm_phys_segs[m->segind], 1195 ("%s: page range [%p,%p) spans multiple segments", 1196 __func__, m_end - npages, m)); 1197 order = flsl(m_end - m) - 1; 1198 vm_freelist_add(fl, m, order, 1); 1199 m += 1 << order; 1200 } 1201 } 1202 1203 /* 1204 * Free a contiguous, arbitrarily sized set of physical pages. 1205 * 1206 * The free page queues must be locked. 
1207 */ 1208 void 1209 vm_phys_free_contig(vm_page_t m, u_long npages) 1210 { 1211 int order_start, order_end; 1212 vm_page_t m_start, m_end; 1213 1214 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1215 1216 m_start = m; 1217 order_start = max_order(m_start); 1218 if (order_start < VM_NFREEORDER - 1) 1219 m_start += 1 << order_start; 1220 m_end = m + npages; 1221 order_end = max_order(m_end); 1222 if (order_end < VM_NFREEORDER - 1) 1223 m_end -= 1 << order_end; 1224 /* 1225 * Avoid unnecessary coalescing by freeing the pages at the start and 1226 * end of the range last. 1227 */ 1228 if (m_start < m_end) 1229 vm_phys_enqueue_contig(m_start, m_end - m_start); 1230 if (order_start < VM_NFREEORDER - 1) 1231 vm_phys_free_pages(m, order_start); 1232 if (order_end < VM_NFREEORDER - 1) 1233 vm_phys_free_pages(m_end, order_end); 1234 } 1235 1236 /* 1237 * Scan physical memory between the specified addresses "low" and "high" for a 1238 * run of contiguous physical pages that satisfy the specified conditions, and 1239 * return the lowest page in the run. The specified "alignment" determines 1240 * the alignment of the lowest physical page in the run. If the specified 1241 * "boundary" is non-zero, then the run of physical pages cannot span a 1242 * physical address that is a multiple of "boundary". 1243 * 1244 * "npages" must be greater than zero. Both "alignment" and "boundary" must 1245 * be a power of two. 
1246 */ 1247 vm_page_t 1248 vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1249 u_long alignment, vm_paddr_t boundary, int options) 1250 { 1251 vm_paddr_t pa_end; 1252 vm_page_t m_end, m_run, m_start; 1253 struct vm_phys_seg *seg; 1254 int segind; 1255 1256 KASSERT(npages > 0, ("npages is 0")); 1257 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1258 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1259 if (low >= high) 1260 return (NULL); 1261 for (segind = 0; segind < vm_phys_nsegs; segind++) { 1262 seg = &vm_phys_segs[segind]; 1263 if (seg->domain != domain) 1264 continue; 1265 if (seg->start >= high) 1266 break; 1267 if (low >= seg->end) 1268 continue; 1269 if (low <= seg->start) 1270 m_start = seg->first_page; 1271 else 1272 m_start = &seg->first_page[atop(low - seg->start)]; 1273 if (high < seg->end) 1274 pa_end = high; 1275 else 1276 pa_end = seg->end; 1277 if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) 1278 continue; 1279 m_end = &seg->first_page[atop(pa_end - seg->start)]; 1280 m_run = vm_page_scan_contig(npages, m_start, m_end, 1281 alignment, boundary, options); 1282 if (m_run != NULL) 1283 return (m_run); 1284 } 1285 return (NULL); 1286 } 1287 1288 /* 1289 * Set the pool for a contiguous, power of two-sized set of physical pages. 1290 */ 1291 void 1292 vm_phys_set_pool(int pool, vm_page_t m, int order) 1293 { 1294 vm_page_t m_tmp; 1295 1296 for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) 1297 m_tmp->pool = pool; 1298 } 1299 1300 /* 1301 * Search for the given physical page "m" in the free lists. If the search 1302 * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return 1303 * FALSE, indicating that "m" is not in the free lists. 1304 * 1305 * The free page queues must be locked. 
1306 */ 1307 boolean_t 1308 vm_phys_unfree_page(vm_page_t m) 1309 { 1310 struct vm_freelist *fl; 1311 struct vm_phys_seg *seg; 1312 vm_paddr_t pa, pa_half; 1313 vm_page_t m_set, m_tmp; 1314 int order; 1315 1316 /* 1317 * First, find the contiguous, power of two-sized set of free 1318 * physical pages containing the given physical page "m" and 1319 * assign it to "m_set". 1320 */ 1321 seg = &vm_phys_segs[m->segind]; 1322 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1323 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && 1324 order < VM_NFREEORDER - 1; ) { 1325 order++; 1326 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); 1327 if (pa >= seg->start) 1328 m_set = &seg->first_page[atop(pa - seg->start)]; 1329 else 1330 return (FALSE); 1331 } 1332 if (m_set->order < order) 1333 return (FALSE); 1334 if (m_set->order == VM_NFREEORDER) 1335 return (FALSE); 1336 KASSERT(m_set->order < VM_NFREEORDER, 1337 ("vm_phys_unfree_page: page %p has unexpected order %d", 1338 m_set, m_set->order)); 1339 1340 /* 1341 * Next, remove "m_set" from the free lists. Finally, extract 1342 * "m" from "m_set" using an iterative algorithm: While "m_set" 1343 * is larger than a page, shrink "m_set" by returning the half 1344 * of "m_set" that does not contain "m" to the free lists. 1345 */ 1346 fl = (*seg->free_queues)[m_set->pool]; 1347 order = m_set->order; 1348 vm_freelist_rem(fl, m_set, order); 1349 while (order > 0) { 1350 order--; 1351 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); 1352 if (m->phys_addr < pa_half) 1353 m_tmp = &seg->first_page[atop(pa_half - seg->start)]; 1354 else { 1355 m_tmp = m_set; 1356 m_set = &seg->first_page[atop(pa_half - seg->start)]; 1357 } 1358 vm_freelist_add(fl, m_tmp, order, 0); 1359 } 1360 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); 1361 return (TRUE); 1362 } 1363 1364 /* 1365 * Allocate a contiguous set of physical pages of the given size 1366 * "npages" from the free lists. 
All of the physical pages must be at 1367 * or above the given physical address "low" and below the given 1368 * physical address "high". The given value "alignment" determines the 1369 * alignment of the first physical page in the set. If the given value 1370 * "boundary" is non-zero, then the set of physical pages cannot cross 1371 * any physical address boundary that is a multiple of that value. Both 1372 * "alignment" and "boundary" must be a power of two. 1373 */ 1374 vm_page_t 1375 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1376 u_long alignment, vm_paddr_t boundary) 1377 { 1378 vm_paddr_t pa_end, pa_start; 1379 vm_page_t m_run; 1380 struct vm_phys_seg *seg; 1381 int segind; 1382 1383 KASSERT(npages > 0, ("npages is 0")); 1384 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1385 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1386 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 1387 if (low >= high) 1388 return (NULL); 1389 m_run = NULL; 1390 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 1391 seg = &vm_phys_segs[segind]; 1392 if (seg->start >= high || seg->domain != domain) 1393 continue; 1394 if (low >= seg->end) 1395 break; 1396 if (low <= seg->start) 1397 pa_start = seg->start; 1398 else 1399 pa_start = low; 1400 if (high < seg->end) 1401 pa_end = high; 1402 else 1403 pa_end = seg->end; 1404 if (pa_end - pa_start < ptoa(npages)) 1405 continue; 1406 m_run = vm_phys_alloc_seg_contig(seg, npages, low, high, 1407 alignment, boundary); 1408 if (m_run != NULL) 1409 break; 1410 } 1411 return (m_run); 1412 } 1413 1414 /* 1415 * Allocate a run of contiguous physical pages from the free list for the 1416 * specified segment. 
1417 */ 1418 static vm_page_t 1419 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, 1420 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1421 { 1422 struct vm_freelist *fl; 1423 vm_paddr_t pa, pa_end, size; 1424 vm_page_t m, m_ret; 1425 u_long npages_end; 1426 int oind, order, pind; 1427 1428 KASSERT(npages > 0, ("npages is 0")); 1429 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1430 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1431 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1432 /* Compute the queue that is the best fit for npages. */ 1433 order = flsl(npages - 1); 1434 /* Search for a run satisfying the specified conditions. */ 1435 size = npages << PAGE_SHIFT; 1436 for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; 1437 oind++) { 1438 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1439 fl = (*seg->free_queues)[pind]; 1440 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { 1441 /* 1442 * Is the size of this allocation request 1443 * larger than the largest block size? 1444 */ 1445 if (order >= VM_NFREEORDER) { 1446 /* 1447 * Determine if a sufficient number of 1448 * subsequent blocks to satisfy the 1449 * allocation request are free. 1450 */ 1451 pa = VM_PAGE_TO_PHYS(m_ret); 1452 pa_end = pa + size; 1453 if (pa_end < pa) 1454 continue; 1455 for (;;) { 1456 pa += 1 << (PAGE_SHIFT + 1457 VM_NFREEORDER - 1); 1458 if (pa >= pa_end || 1459 pa < seg->start || 1460 pa >= seg->end) 1461 break; 1462 m = &seg->first_page[atop(pa - 1463 seg->start)]; 1464 if (m->order != VM_NFREEORDER - 1465 1) 1466 break; 1467 } 1468 /* If not, go to the next block. */ 1469 if (pa < pa_end) 1470 continue; 1471 } 1472 1473 /* 1474 * Determine if the blocks are within the 1475 * given range, satisfy the given alignment, 1476 * and do not cross the given boundary. 
1477 */ 1478 pa = VM_PAGE_TO_PHYS(m_ret); 1479 pa_end = pa + size; 1480 if (pa >= low && pa_end <= high && 1481 (pa & (alignment - 1)) == 0 && 1482 rounddown2(pa ^ (pa_end - 1), boundary) == 0) 1483 goto done; 1484 } 1485 } 1486 } 1487 return (NULL); 1488 done: 1489 for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { 1490 fl = (*seg->free_queues)[m->pool]; 1491 vm_freelist_rem(fl, m, oind); 1492 if (m->pool != VM_FREEPOOL_DEFAULT) 1493 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind); 1494 } 1495 /* Return excess pages to the free lists. */ 1496 npages_end = roundup2(npages, 1 << oind); 1497 if (npages < npages_end) { 1498 fl = (*seg->free_queues)[VM_FREEPOOL_DEFAULT]; 1499 vm_phys_enq_range(&m_ret[npages], npages_end - npages, fl, 0); 1500 } 1501 return (m_ret); 1502 } 1503 1504 /* 1505 * Return the index of the first unused slot which may be the terminating 1506 * entry. 1507 */ 1508 static int 1509 vm_phys_avail_count(void) 1510 { 1511 int i; 1512 1513 for (i = 0; phys_avail[i + 1]; i += 2) 1514 continue; 1515 if (i > PHYS_AVAIL_ENTRIES) 1516 panic("Improperly terminated phys_avail %d entries", i); 1517 1518 return (i); 1519 } 1520 1521 /* 1522 * Assert that a phys_avail entry is valid. 1523 */ 1524 static void 1525 vm_phys_avail_check(int i) 1526 { 1527 if (phys_avail[i] & PAGE_MASK) 1528 panic("Unaligned phys_avail[%d]: %#jx", i, 1529 (intmax_t)phys_avail[i]); 1530 if (phys_avail[i+1] & PAGE_MASK) 1531 panic("Unaligned phys_avail[%d + 1]: %#jx", i, 1532 (intmax_t)phys_avail[i]); 1533 if (phys_avail[i + 1] < phys_avail[i]) 1534 panic("phys_avail[%d] start %#jx < end %#jx", i, 1535 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]); 1536 } 1537 1538 /* 1539 * Return the index of an overlapping phys_avail entry or -1. 
1540 */ 1541 #ifdef NUMA 1542 static int 1543 vm_phys_avail_find(vm_paddr_t pa) 1544 { 1545 int i; 1546 1547 for (i = 0; phys_avail[i + 1]; i += 2) 1548 if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) 1549 return (i); 1550 return (-1); 1551 } 1552 #endif 1553 1554 /* 1555 * Return the index of the largest entry. 1556 */ 1557 int 1558 vm_phys_avail_largest(void) 1559 { 1560 vm_paddr_t sz, largesz; 1561 int largest; 1562 int i; 1563 1564 largest = 0; 1565 largesz = 0; 1566 for (i = 0; phys_avail[i + 1]; i += 2) { 1567 sz = vm_phys_avail_size(i); 1568 if (sz > largesz) { 1569 largesz = sz; 1570 largest = i; 1571 } 1572 } 1573 1574 return (largest); 1575 } 1576 1577 vm_paddr_t 1578 vm_phys_avail_size(int i) 1579 { 1580 1581 return (phys_avail[i + 1] - phys_avail[i]); 1582 } 1583 1584 /* 1585 * Split an entry at the address 'pa'. Return zero on success or errno. 1586 */ 1587 static int 1588 vm_phys_avail_split(vm_paddr_t pa, int i) 1589 { 1590 int cnt; 1591 1592 vm_phys_avail_check(i); 1593 if (pa <= phys_avail[i] || pa >= phys_avail[i + 1]) 1594 panic("vm_phys_avail_split: invalid address"); 1595 cnt = vm_phys_avail_count(); 1596 if (cnt >= PHYS_AVAIL_ENTRIES) 1597 return (ENOSPC); 1598 memmove(&phys_avail[i + 2], &phys_avail[i], 1599 (cnt - i) * sizeof(phys_avail[0])); 1600 phys_avail[i + 1] = pa; 1601 phys_avail[i + 2] = pa; 1602 vm_phys_avail_check(i); 1603 vm_phys_avail_check(i+2); 1604 1605 return (0); 1606 } 1607 1608 /* 1609 * This routine allocates NUMA node specific memory before the page 1610 * allocator is bootstrapped. 1611 */ 1612 vm_paddr_t 1613 vm_phys_early_alloc(int domain, size_t alloc_size) 1614 { 1615 int i, mem_index, biggestone; 1616 vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align; 1617 1618 1619 /* 1620 * Search the mem_affinity array for the biggest address 1621 * range in the desired domain. This is used to constrain 1622 * the phys_avail selection below. 
1623 */ 1624 biggestsize = 0; 1625 mem_index = 0; 1626 mem_start = 0; 1627 mem_end = -1; 1628 #ifdef NUMA 1629 if (mem_affinity != NULL) { 1630 for (i = 0; ; i++) { 1631 size = mem_affinity[i].end - mem_affinity[i].start; 1632 if (size == 0) 1633 break; 1634 if (mem_affinity[i].domain != domain) 1635 continue; 1636 if (size > biggestsize) { 1637 mem_index = i; 1638 biggestsize = size; 1639 } 1640 } 1641 mem_start = mem_affinity[mem_index].start; 1642 mem_end = mem_affinity[mem_index].end; 1643 } 1644 #endif 1645 1646 /* 1647 * Now find biggest physical segment in within the desired 1648 * numa domain. 1649 */ 1650 biggestsize = 0; 1651 biggestone = 0; 1652 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1653 /* skip regions that are out of range */ 1654 if (phys_avail[i+1] - alloc_size < mem_start || 1655 phys_avail[i+1] > mem_end) 1656 continue; 1657 size = vm_phys_avail_size(i); 1658 if (size > biggestsize) { 1659 biggestone = i; 1660 biggestsize = size; 1661 } 1662 } 1663 alloc_size = round_page(alloc_size); 1664 1665 /* 1666 * Grab single pages from the front to reduce fragmentation. 1667 */ 1668 if (alloc_size == PAGE_SIZE) { 1669 pa = phys_avail[biggestone]; 1670 phys_avail[biggestone] += PAGE_SIZE; 1671 vm_phys_avail_check(biggestone); 1672 return (pa); 1673 } 1674 1675 /* 1676 * Naturally align large allocations. 1677 */ 1678 align = phys_avail[biggestone + 1] & (alloc_size - 1); 1679 if (alloc_size + align > biggestsize) 1680 panic("cannot find a large enough size\n"); 1681 if (align != 0 && 1682 vm_phys_avail_split(phys_avail[biggestone + 1] - align, 1683 biggestone) != 0) 1684 /* Wasting memory. 
*/ 1685 phys_avail[biggestone + 1] -= align; 1686 1687 phys_avail[biggestone + 1] -= alloc_size; 1688 vm_phys_avail_check(biggestone); 1689 pa = phys_avail[biggestone + 1]; 1690 return (pa); 1691 } 1692 1693 void 1694 vm_phys_early_startup(void) 1695 { 1696 int i; 1697 1698 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1699 phys_avail[i] = round_page(phys_avail[i]); 1700 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 1701 } 1702 1703 #ifdef NUMA 1704 /* Force phys_avail to be split by domain. */ 1705 if (mem_affinity != NULL) { 1706 int idx; 1707 1708 for (i = 0; mem_affinity[i].end != 0; i++) { 1709 idx = vm_phys_avail_find(mem_affinity[i].start); 1710 if (idx != -1 && 1711 phys_avail[idx] != mem_affinity[i].start) 1712 vm_phys_avail_split(mem_affinity[i].start, idx); 1713 idx = vm_phys_avail_find(mem_affinity[i].end); 1714 if (idx != -1 && 1715 phys_avail[idx] != mem_affinity[i].end) 1716 vm_phys_avail_split(mem_affinity[i].end, idx); 1717 } 1718 } 1719 #endif 1720 } 1721 1722 #ifdef DDB 1723 /* 1724 * Show the number of physical pages in each of the free lists. 
1725 */ 1726 DB_SHOW_COMMAND(freepages, db_show_freepages) 1727 { 1728 struct vm_freelist *fl; 1729 int flind, oind, pind, dom; 1730 1731 for (dom = 0; dom < vm_ndomains; dom++) { 1732 db_printf("DOMAIN: %d\n", dom); 1733 for (flind = 0; flind < vm_nfreelists; flind++) { 1734 db_printf("FREE LIST %d:\n" 1735 "\n ORDER (SIZE) | NUMBER" 1736 "\n ", flind); 1737 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1738 db_printf(" | POOL %d", pind); 1739 db_printf("\n-- "); 1740 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1741 db_printf("-- -- "); 1742 db_printf("--\n"); 1743 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 1744 db_printf(" %2.2d (%6.6dK)", oind, 1745 1 << (PAGE_SHIFT - 10 + oind)); 1746 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1747 fl = vm_phys_free_queues[dom][flind][pind]; 1748 db_printf(" | %6.6d", fl[oind].lcnt); 1749 } 1750 db_printf("\n"); 1751 } 1752 db_printf("\n"); 1753 } 1754 db_printf("\n"); 1755 } 1756 } 1757 #endif 1758