1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2002-2006 Rice University 5 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu> 6 * All rights reserved. 7 * 8 * This software was developed for the FreeBSD Project by Alan L. Cox, 9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Physical memory system implementation 36 * 37 * Any external functions defined by this module are only to be used by the 38 * virtual memory system. 39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 #include "opt_ddb.h" 45 #include "opt_vm.h" 46 47 #include <sys/param.h> 48 #include <sys/systm.h> 49 #include <sys/domainset.h> 50 #include <sys/lock.h> 51 #include <sys/kernel.h> 52 #include <sys/malloc.h> 53 #include <sys/mutex.h> 54 #include <sys/proc.h> 55 #include <sys/queue.h> 56 #include <sys/rwlock.h> 57 #include <sys/sbuf.h> 58 #include <sys/sysctl.h> 59 #include <sys/tree.h> 60 #include <sys/vmmeter.h> 61 62 #include <ddb/ddb.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_param.h> 66 #include <vm/vm_kern.h> 67 #include <vm/vm_object.h> 68 #include <vm/vm_page.h> 69 #include <vm/vm_phys.h> 70 #include <vm/vm_pagequeue.h> 71 72 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, 73 "Too many physsegs."); 74 75 #ifdef NUMA 76 struct mem_affinity __read_mostly *mem_affinity; 77 int __read_mostly *mem_locality; 78 #endif 79 80 int __read_mostly vm_ndomains = 1; 81 domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1); 82 83 struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX]; 84 int __read_mostly vm_phys_nsegs; 85 static struct vm_phys_seg vm_phys_early_segs[8]; 86 static int vm_phys_early_nsegs; 87 88 struct vm_phys_fictitious_seg; 89 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, 90 struct vm_phys_fictitious_seg *); 91 92 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = 93 RB_INITIALIZER(&vm_phys_fictitious_tree); 94 95 struct vm_phys_fictitious_seg { 96 RB_ENTRY(vm_phys_fictitious_seg) node; 97 /* Memory region data */ 98 vm_paddr_t start; 99 vm_paddr_t end; 100 vm_page_t first_page; 101 }; 102 103 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, 104 vm_phys_fictitious_cmp); 105 106 static struct rwlock_padalign vm_phys_fictitious_reg_lock; 107 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); 108 109 static struct vm_freelist __aligned(CACHE_LINE_SIZE) 110 vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL] 111 [VM_NFREEORDER_MAX]; 112 113 static int __read_mostly vm_nfreelists; 114 115 /* 116 * These "avail lists" are globals used to communicate boot-time physical 117 * memory layout to other parts of the kernel. Each physically contiguous 118 * region of memory is defined by a start address at an even index and an 119 * end address at the following odd index. Each list is terminated by a 120 * pair of zero entries. 121 * 122 * dump_avail tells the dump code what regions to include in a crash dump, and 123 * phys_avail is all of the remaining physical memory that is available for 124 * the vm system. 125 * 126 * Initially dump_avail and phys_avail are identical. Boot time memory 127 * allocations remove extents from phys_avail that may still be included 128 * in dumps. 129 */ 130 vm_paddr_t phys_avail[PHYS_AVAIL_COUNT]; 131 vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; 132 133 /* 134 * Provides the mapping from VM_FREELIST_* to free list indices (flind). 135 */ 136 static int __read_mostly vm_freelist_to_flind[VM_NFREELIST]; 137 138 CTASSERT(VM_FREELIST_DEFAULT == 0); 139 140 #ifdef VM_FREELIST_DMA32 141 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) 142 #endif 143 144 /* 145 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about 146 * the ordering of the free list boundaries. 147 */ 148 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) 149 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); 150 #endif 151 152 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); 153 SYSCTL_OID(_vm, OID_AUTO, phys_free, 154 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 155 sysctl_vm_phys_free, "A", 156 "Phys Free Info"); 157 158 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); 159 SYSCTL_OID(_vm, OID_AUTO, phys_segs, 160 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 161 sysctl_vm_phys_segs, "A", 162 "Phys Seg Info"); 163 164 #ifdef NUMA 165 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); 166 SYSCTL_OID(_vm, OID_AUTO, phys_locality, 167 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 168 sysctl_vm_phys_locality, "A", 169 "Phys Locality Info"); 170 #endif 171 172 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, 173 &vm_ndomains, 0, "Number of physical memory domains available."); 174 175 static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, 176 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 177 vm_paddr_t boundary); 178 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); 179 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); 180 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, 181 int order, int tail); 182 183 /* 184 * Red-black tree helpers for vm fictitious range management. 185 */ 186 static inline int 187 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, 188 struct vm_phys_fictitious_seg *range) 189 { 190 191 KASSERT(range->start != 0 && range->end != 0, 192 ("Invalid range passed on search for vm_fictitious page")); 193 if (p->start >= range->end) 194 return (1); 195 if (p->start < range->start) 196 return (-1); 197 198 return (0); 199 } 200 201 static int 202 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, 203 struct vm_phys_fictitious_seg *p2) 204 { 205 206 /* Check if this is a search for a page */ 207 if (p1->end == 0) 208 return (vm_phys_fictitious_in_range(p1, p2)); 209 210 KASSERT(p2->end != 0, 211 ("Invalid range passed as second parameter to vm fictitious comparison")); 212 213 /* Searching to add a new range */ 214 if (p1->end <= p2->start) 215 return (-1); 216 if (p1->start >= p2->end) 217 return (1); 218 219 panic("Trying to add overlapping vm fictitious ranges:\n" 220 "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, 221 (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); 222 } 223 224 int 225 vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high) 226 { 227 #ifdef NUMA 228 domainset_t mask; 229 int i; 230 231 if (vm_ndomains == 1 || mem_affinity == NULL) 232 return (0); 233 234 DOMAINSET_ZERO(&mask); 235 /* 236 * Check for any memory that overlaps low, high. 237 */ 238 for (i = 0; mem_affinity[i].end != 0; i++) 239 if (mem_affinity[i].start <= high && 240 mem_affinity[i].end >= low) 241 DOMAINSET_SET(mem_affinity[i].domain, &mask); 242 if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask)) 243 return (prefer); 244 if (DOMAINSET_EMPTY(&mask)) 245 panic("vm_phys_domain_match: Impossible constraint"); 246 return (DOMAINSET_FFS(&mask) - 1); 247 #else 248 return (0); 249 #endif 250 } 251 252 /* 253 * Outputs the state of the physical memory allocator, specifically, 254 * the amount of physical memory in each free list. 255 */ 256 static int 257 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) 258 { 259 struct sbuf sbuf; 260 struct vm_freelist *fl; 261 int dom, error, flind, oind, pind; 262 263 error = sysctl_wire_old_buffer(req, 0); 264 if (error != 0) 265 return (error); 266 sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); 267 for (dom = 0; dom < vm_ndomains; dom++) { 268 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); 269 for (flind = 0; flind < vm_nfreelists; flind++) { 270 sbuf_printf(&sbuf, "\nFREE LIST %d:\n" 271 "\n ORDER (SIZE) | NUMBER" 272 "\n ", flind); 273 for (pind = 0; pind < VM_NFREEPOOL; pind++) 274 sbuf_printf(&sbuf, " | POOL %d", pind); 275 sbuf_printf(&sbuf, "\n-- "); 276 for (pind = 0; pind < VM_NFREEPOOL; pind++) 277 sbuf_printf(&sbuf, "-- -- "); 278 sbuf_printf(&sbuf, "--\n"); 279 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 280 sbuf_printf(&sbuf, " %2d (%6dK)", oind, 281 1 << (PAGE_SHIFT - 10 + oind)); 282 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 283 fl = vm_phys_free_queues[dom][flind][pind]; 284 sbuf_printf(&sbuf, " | %6d", 285 fl[oind].lcnt); 286 } 287 sbuf_printf(&sbuf, "\n"); 288 } 289 } 290 } 291 error = sbuf_finish(&sbuf); 292 sbuf_delete(&sbuf); 293 return (error); 294 } 295 296 /* 297 * Outputs the set of physical memory segments. 298 */ 299 static int 300 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) 301 { 302 struct sbuf sbuf; 303 struct vm_phys_seg *seg; 304 int error, segind; 305 306 error = sysctl_wire_old_buffer(req, 0); 307 if (error != 0) 308 return (error); 309 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 310 for (segind = 0; segind < vm_phys_nsegs; segind++) { 311 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); 312 seg = &vm_phys_segs[segind]; 313 sbuf_printf(&sbuf, "start: %#jx\n", 314 (uintmax_t)seg->start); 315 sbuf_printf(&sbuf, "end: %#jx\n", 316 (uintmax_t)seg->end); 317 sbuf_printf(&sbuf, "domain: %d\n", seg->domain); 318 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); 319 } 320 error = sbuf_finish(&sbuf); 321 sbuf_delete(&sbuf); 322 return (error); 323 } 324 325 /* 326 * Return affinity, or -1 if there's no affinity information. 327 */ 328 int 329 vm_phys_mem_affinity(int f, int t) 330 { 331 332 #ifdef NUMA 333 if (mem_locality == NULL) 334 return (-1); 335 if (f >= vm_ndomains || t >= vm_ndomains) 336 return (-1); 337 return (mem_locality[f * vm_ndomains + t]); 338 #else 339 return (-1); 340 #endif 341 } 342 343 #ifdef NUMA 344 /* 345 * Outputs the VM locality table. 346 */ 347 static int 348 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) 349 { 350 struct sbuf sbuf; 351 int error, i, j; 352 353 error = sysctl_wire_old_buffer(req, 0); 354 if (error != 0) 355 return (error); 356 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 357 358 sbuf_printf(&sbuf, "\n"); 359 360 for (i = 0; i < vm_ndomains; i++) { 361 sbuf_printf(&sbuf, "%d: ", i); 362 for (j = 0; j < vm_ndomains; j++) { 363 sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); 364 } 365 sbuf_printf(&sbuf, "\n"); 366 } 367 error = sbuf_finish(&sbuf); 368 sbuf_delete(&sbuf); 369 return (error); 370 } 371 #endif 372 373 static void 374 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) 375 { 376 377 m->order = order; 378 if (tail) 379 TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); 380 else 381 TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); 382 fl[order].lcnt++; 383 } 384 385 static void 386 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) 387 { 388 389 TAILQ_REMOVE(&fl[order].pl, m, listq); 390 fl[order].lcnt--; 391 m->order = VM_NFREEORDER; 392 } 393 394 /* 395 * Create a physical memory segment. 396 */ 397 static void 398 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) 399 { 400 struct vm_phys_seg *seg; 401 402 KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, 403 ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); 404 KASSERT(domain >= 0 && domain < vm_ndomains, 405 ("vm_phys_create_seg: invalid domain provided")); 406 seg = &vm_phys_segs[vm_phys_nsegs++]; 407 while (seg > vm_phys_segs && (seg - 1)->start >= end) { 408 *seg = *(seg - 1); 409 seg--; 410 } 411 seg->start = start; 412 seg->end = end; 413 seg->domain = domain; 414 } 415 416 static void 417 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) 418 { 419 #ifdef NUMA 420 int i; 421 422 if (mem_affinity == NULL) { 423 _vm_phys_create_seg(start, end, 0); 424 return; 425 } 426 427 for (i = 0;; i++) { 428 if (mem_affinity[i].end == 0) 429 panic("Reached end of affinity info"); 430 if (mem_affinity[i].end <= start) 431 continue; 432 if (mem_affinity[i].start > start) 433 panic("No affinity info for start %jx", 434 (uintmax_t)start); 435 if (mem_affinity[i].end >= end) { 436 _vm_phys_create_seg(start, end, 437 mem_affinity[i].domain); 438 break; 439 } 440 _vm_phys_create_seg(start, mem_affinity[i].end, 441 mem_affinity[i].domain); 442 start = mem_affinity[i].end; 443 } 444 #else 445 _vm_phys_create_seg(start, end, 0); 446 #endif 447 } 448 449 /* 450 * Add a physical memory segment. 451 */ 452 void 453 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) 454 { 455 vm_paddr_t paddr; 456 457 KASSERT((start & PAGE_MASK) == 0, 458 ("vm_phys_define_seg: start is not page aligned")); 459 KASSERT((end & PAGE_MASK) == 0, 460 ("vm_phys_define_seg: end is not page aligned")); 461 462 /* 463 * Split the physical memory segment if it spans two or more free 464 * list boundaries. 465 */ 466 paddr = start; 467 #ifdef VM_FREELIST_LOWMEM 468 if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { 469 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); 470 paddr = VM_LOWMEM_BOUNDARY; 471 } 472 #endif 473 #ifdef VM_FREELIST_DMA32 474 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { 475 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); 476 paddr = VM_DMA32_BOUNDARY; 477 } 478 #endif 479 vm_phys_create_seg(paddr, end); 480 } 481 482 /* 483 * Initialize the physical memory allocator. 484 * 485 * Requires that vm_page_array is initialized! 486 */ 487 void 488 vm_phys_init(void) 489 { 490 struct vm_freelist *fl; 491 struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg; 492 u_long npages; 493 int dom, flind, freelist, oind, pind, segind; 494 495 /* 496 * Compute the number of free lists, and generate the mapping from the 497 * manifest constants VM_FREELIST_* to the free list indices. 498 * 499 * Initially, the entries of vm_freelist_to_flind[] are set to either 500 * 0 or 1 to indicate which free lists should be created. 501 */ 502 npages = 0; 503 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 504 seg = &vm_phys_segs[segind]; 505 #ifdef VM_FREELIST_LOWMEM 506 if (seg->end <= VM_LOWMEM_BOUNDARY) 507 vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; 508 else 509 #endif 510 #ifdef VM_FREELIST_DMA32 511 if ( 512 #ifdef VM_DMA32_NPAGES_THRESHOLD 513 /* 514 * Create the DMA32 free list only if the amount of 515 * physical memory above physical address 4G exceeds the 516 * given threshold. 517 */ 518 npages > VM_DMA32_NPAGES_THRESHOLD && 519 #endif 520 seg->end <= VM_DMA32_BOUNDARY) 521 vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; 522 else 523 #endif 524 { 525 npages += atop(seg->end - seg->start); 526 vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; 527 } 528 } 529 /* Change each entry into a running total of the free lists. */ 530 for (freelist = 1; freelist < VM_NFREELIST; freelist++) { 531 vm_freelist_to_flind[freelist] += 532 vm_freelist_to_flind[freelist - 1]; 533 } 534 vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; 535 KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); 536 /* Change each entry into a free list index. */ 537 for (freelist = 0; freelist < VM_NFREELIST; freelist++) 538 vm_freelist_to_flind[freelist]--; 539 540 /* 541 * Initialize the first_page and free_queues fields of each physical 542 * memory segment. 543 */ 544 #ifdef VM_PHYSSEG_SPARSE 545 npages = 0; 546 #endif 547 for (segind = 0; segind < vm_phys_nsegs; segind++) { 548 seg = &vm_phys_segs[segind]; 549 #ifdef VM_PHYSSEG_SPARSE 550 seg->first_page = &vm_page_array[npages]; 551 npages += atop(seg->end - seg->start); 552 #else 553 seg->first_page = PHYS_TO_VM_PAGE(seg->start); 554 #endif 555 #ifdef VM_FREELIST_LOWMEM 556 if (seg->end <= VM_LOWMEM_BOUNDARY) { 557 flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; 558 KASSERT(flind >= 0, 559 ("vm_phys_init: LOWMEM flind < 0")); 560 } else 561 #endif 562 #ifdef VM_FREELIST_DMA32 563 if (seg->end <= VM_DMA32_BOUNDARY) { 564 flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; 565 KASSERT(flind >= 0, 566 ("vm_phys_init: DMA32 flind < 0")); 567 } else 568 #endif 569 { 570 flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; 571 KASSERT(flind >= 0, 572 ("vm_phys_init: DEFAULT flind < 0")); 573 } 574 seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; 575 } 576 577 /* 578 * Coalesce physical memory segments that are contiguous and share the 579 * same per-domain free queues. 580 */ 581 prev_seg = vm_phys_segs; 582 seg = &vm_phys_segs[1]; 583 end_seg = &vm_phys_segs[vm_phys_nsegs]; 584 while (seg < end_seg) { 585 if (prev_seg->end == seg->start && 586 prev_seg->free_queues == seg->free_queues) { 587 prev_seg->end = seg->end; 588 KASSERT(prev_seg->domain == seg->domain, 589 ("vm_phys_init: free queues cannot span domains")); 590 vm_phys_nsegs--; 591 end_seg--; 592 for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++) 593 *tmp_seg = *(tmp_seg + 1); 594 } else { 595 prev_seg = seg; 596 seg++; 597 } 598 } 599 600 /* 601 * Initialize the free queues. 602 */ 603 for (dom = 0; dom < vm_ndomains; dom++) { 604 for (flind = 0; flind < vm_nfreelists; flind++) { 605 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 606 fl = vm_phys_free_queues[dom][flind][pind]; 607 for (oind = 0; oind < VM_NFREEORDER; oind++) 608 TAILQ_INIT(&fl[oind].pl); 609 } 610 } 611 } 612 613 rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); 614 } 615 616 /* 617 * Register info about the NUMA topology of the system. 618 * 619 * Invoked by platform-dependent code prior to vm_phys_init(). 620 */ 621 void 622 vm_phys_register_domains(int ndomains, struct mem_affinity *affinity, 623 int *locality) 624 { 625 #ifdef NUMA 626 int d, i; 627 628 /* 629 * For now the only override value that we support is 1, which 630 * effectively disables NUMA-awareness in the allocators. 631 */ 632 d = 0; 633 TUNABLE_INT_FETCH("vm.numa.disabled", &d); 634 if (d) 635 ndomains = 1; 636 637 if (ndomains > 1) { 638 vm_ndomains = ndomains; 639 mem_affinity = affinity; 640 mem_locality = locality; 641 } 642 643 for (i = 0; i < vm_ndomains; i++) 644 DOMAINSET_SET(i, &all_domains); 645 #else 646 (void)ndomains; 647 (void)affinity; 648 (void)locality; 649 #endif 650 } 651 652 /* 653 * Split a contiguous, power of two-sized set of physical pages. 654 * 655 * When this function is called by a page allocation function, the caller 656 * should request insertion at the head unless the order [order, oind) queues 657 * are known to be empty. The objective being to reduce the likelihood of 658 * long-term fragmentation by promoting contemporaneous allocation and 659 * (hopefully) deallocation. 660 */ 661 static __inline void 662 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, 663 int tail) 664 { 665 vm_page_t m_buddy; 666 667 while (oind > order) { 668 oind--; 669 m_buddy = &m[1 << oind]; 670 KASSERT(m_buddy->order == VM_NFREEORDER, 671 ("vm_phys_split_pages: page %p has unexpected order %d", 672 m_buddy, m_buddy->order)); 673 vm_freelist_add(fl, m_buddy, oind, tail); 674 } 675 } 676 677 /* 678 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned 679 * and sized set to the specified free list. 680 * 681 * When this function is called by a page allocation function, the caller 682 * should request insertion at the head unless the lower-order queues are 683 * known to be empty. The objective being to reduce the likelihood of long- 684 * term fragmentation by promoting contemporaneous allocation and (hopefully) 685 * deallocation. 686 * 687 * The physical page m's buddy must not be free. 688 */ 689 static void 690 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) 691 { 692 u_int n; 693 int order; 694 695 KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0")); 696 KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) & 697 ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0, 698 ("vm_phys_enq_range: page %p and npages %u are misaligned", 699 m, npages)); 700 do { 701 KASSERT(m->order == VM_NFREEORDER, 702 ("vm_phys_enq_range: page %p has unexpected order %d", 703 m, m->order)); 704 order = ffs(npages) - 1; 705 KASSERT(order < VM_NFREEORDER, 706 ("vm_phys_enq_range: order %d is out of range", order)); 707 vm_freelist_add(fl, m, order, tail); 708 n = 1 << order; 709 m += n; 710 npages -= n; 711 } while (npages > 0); 712 } 713 714 /* 715 * Tries to allocate the specified number of pages from the specified pool 716 * within the specified domain. Returns the actual number of allocated pages 717 * and a pointer to each page through the array ma[]. 718 * 719 * The returned pages may not be physically contiguous. However, in contrast 720 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0), 721 * calling this function once to allocate the desired number of pages will 722 * avoid wasted time in vm_phys_split_pages(). 723 * 724 * The free page queues for the specified domain must be locked. 725 */ 726 int 727 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) 728 { 729 struct vm_freelist *alt, *fl; 730 vm_page_t m; 731 int avail, end, flind, freelist, i, need, oind, pind; 732 733 KASSERT(domain >= 0 && domain < vm_ndomains, 734 ("vm_phys_alloc_npages: domain %d is out of range", domain)); 735 KASSERT(pool < VM_NFREEPOOL, 736 ("vm_phys_alloc_npages: pool %d is out of range", pool)); 737 KASSERT(npages <= 1 << (VM_NFREEORDER - 1), 738 ("vm_phys_alloc_npages: npages %d is out of range", npages)); 739 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 740 i = 0; 741 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 742 flind = vm_freelist_to_flind[freelist]; 743 if (flind < 0) 744 continue; 745 fl = vm_phys_free_queues[domain][flind][pool]; 746 for (oind = 0; oind < VM_NFREEORDER; oind++) { 747 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 748 vm_freelist_rem(fl, m, oind); 749 avail = 1 << oind; 750 need = imin(npages - i, avail); 751 for (end = i + need; i < end;) 752 ma[i++] = m++; 753 if (need < avail) { 754 /* 755 * Return excess pages to fl. Its 756 * order [0, oind) queues are empty. 757 */ 758 vm_phys_enq_range(m, avail - need, fl, 759 1); 760 return (npages); 761 } else if (i == npages) 762 return (npages); 763 } 764 } 765 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 766 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 767 alt = vm_phys_free_queues[domain][flind][pind]; 768 while ((m = TAILQ_FIRST(&alt[oind].pl)) != 769 NULL) { 770 vm_freelist_rem(alt, m, oind); 771 vm_phys_set_pool(pool, m, oind); 772 avail = 1 << oind; 773 need = imin(npages - i, avail); 774 for (end = i + need; i < end;) 775 ma[i++] = m++; 776 if (need < avail) { 777 /* 778 * Return excess pages to fl. 779 * Its order [0, oind) queues 780 * are empty. 781 */ 782 vm_phys_enq_range(m, avail - 783 need, fl, 1); 784 return (npages); 785 } else if (i == npages) 786 return (npages); 787 } 788 } 789 } 790 } 791 return (i); 792 } 793 794 /* 795 * Allocate a contiguous, power of two-sized set of physical pages 796 * from the free lists. 797 * 798 * The free page queues must be locked. 799 */ 800 vm_page_t 801 vm_phys_alloc_pages(int domain, int pool, int order) 802 { 803 vm_page_t m; 804 int freelist; 805 806 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 807 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); 808 if (m != NULL) 809 return (m); 810 } 811 return (NULL); 812 } 813 814 /* 815 * Allocate a contiguous, power of two-sized set of physical pages from the 816 * specified free list. The free list must be specified using one of the 817 * manifest constants VM_FREELIST_*. 818 * 819 * The free page queues must be locked. 820 */ 821 vm_page_t 822 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) 823 { 824 struct vm_freelist *alt, *fl; 825 vm_page_t m; 826 int oind, pind, flind; 827 828 KASSERT(domain >= 0 && domain < vm_ndomains, 829 ("vm_phys_alloc_freelist_pages: domain %d is out of range", 830 domain)); 831 KASSERT(freelist < VM_NFREELIST, 832 ("vm_phys_alloc_freelist_pages: freelist %d is out of range", 833 freelist)); 834 KASSERT(pool < VM_NFREEPOOL, 835 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); 836 KASSERT(order < VM_NFREEORDER, 837 ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); 838 839 flind = vm_freelist_to_flind[freelist]; 840 /* Check if freelist is present */ 841 if (flind < 0) 842 return (NULL); 843 844 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 845 fl = &vm_phys_free_queues[domain][flind][pool][0]; 846 for (oind = order; oind < VM_NFREEORDER; oind++) { 847 m = TAILQ_FIRST(&fl[oind].pl); 848 if (m != NULL) { 849 vm_freelist_rem(fl, m, oind); 850 /* The order [order, oind) queues are empty. */ 851 vm_phys_split_pages(m, oind, fl, order, 1); 852 return (m); 853 } 854 } 855 856 /* 857 * The given pool was empty. Find the largest 858 * contiguous, power-of-two-sized set of pages in any 859 * pool. Transfer these pages to the given pool, and 860 * use them to satisfy the allocation. 861 */ 862 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { 863 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 864 alt = &vm_phys_free_queues[domain][flind][pind][0]; 865 m = TAILQ_FIRST(&alt[oind].pl); 866 if (m != NULL) { 867 vm_freelist_rem(alt, m, oind); 868 vm_phys_set_pool(pool, m, oind); 869 /* The order [order, oind) queues are empty. */ 870 vm_phys_split_pages(m, oind, fl, order, 1); 871 return (m); 872 } 873 } 874 } 875 return (NULL); 876 } 877 878 /* 879 * Find the vm_page corresponding to the given physical address. 880 */ 881 vm_page_t 882 vm_phys_paddr_to_vm_page(vm_paddr_t pa) 883 { 884 struct vm_phys_seg *seg; 885 int segind; 886 887 for (segind = 0; segind < vm_phys_nsegs; segind++) { 888 seg = &vm_phys_segs[segind]; 889 if (pa >= seg->start && pa < seg->end) 890 return (&seg->first_page[atop(pa - seg->start)]); 891 } 892 return (NULL); 893 } 894 895 vm_page_t 896 vm_phys_fictitious_to_vm_page(vm_paddr_t pa) 897 { 898 struct vm_phys_fictitious_seg tmp, *seg; 899 vm_page_t m; 900 901 m = NULL; 902 tmp.start = pa; 903 tmp.end = 0; 904 905 rw_rlock(&vm_phys_fictitious_reg_lock); 906 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 907 rw_runlock(&vm_phys_fictitious_reg_lock); 908 if (seg == NULL) 909 return (NULL); 910 911 m = &seg->first_page[atop(pa - seg->start)]; 912 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); 913 914 return (m); 915 } 916 917 static inline void 918 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, 919 long page_count, vm_memattr_t memattr) 920 { 921 long i; 922 923 bzero(range, page_count * sizeof(*range)); 924 for (i = 0; i < page_count; i++) { 925 vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); 926 range[i].oflags &= ~VPO_UNMANAGED; 927 range[i].busy_lock = VPB_UNBUSIED; 928 } 929 } 930 931 int 932 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, 933 vm_memattr_t memattr) 934 { 935 struct vm_phys_fictitious_seg *seg; 936 vm_page_t fp; 937 long page_count; 938 #ifdef VM_PHYSSEG_DENSE 939 long pi, pe; 940 long dpage_count; 941 #endif 942 943 KASSERT(start < end, 944 ("Start of segment isn't less than end (start: %jx end: %jx)", 945 (uintmax_t)start, (uintmax_t)end)); 946 947 page_count = (end - start) / PAGE_SIZE; 948 949 #ifdef VM_PHYSSEG_DENSE 950 pi = atop(start); 951 pe = atop(end); 952 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 953 fp = &vm_page_array[pi - first_page]; 954 if ((pe - first_page) > vm_page_array_size) { 955 /* 956 * We have a segment that starts inside 957 * of vm_page_array, but ends outside of it. 958 * 959 * Use vm_page_array pages for those that are 960 * inside of the vm_page_array range, and 961 * allocate the remaining ones. 962 */ 963 dpage_count = vm_page_array_size - (pi - first_page); 964 vm_phys_fictitious_init_range(fp, start, dpage_count, 965 memattr); 966 page_count -= dpage_count; 967 start += ptoa(dpage_count); 968 goto alloc; 969 } 970 /* 971 * We can allocate the full range from vm_page_array, 972 * so there's no need to register the range in the tree. 973 */ 974 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 975 return (0); 976 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 977 /* 978 * We have a segment that ends inside of vm_page_array, 979 * but starts outside of it. 980 */ 981 fp = &vm_page_array[0]; 982 dpage_count = pe - first_page; 983 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, 984 memattr); 985 end -= ptoa(dpage_count); 986 page_count -= dpage_count; 987 goto alloc; 988 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 989 /* 990 * Trying to register a fictitious range that expands before 991 * and after vm_page_array. 992 */ 993 return (EINVAL); 994 } else { 995 alloc: 996 #endif 997 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, 998 M_WAITOK); 999 #ifdef VM_PHYSSEG_DENSE 1000 } 1001 #endif 1002 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1003 1004 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); 1005 seg->start = start; 1006 seg->end = end; 1007 seg->first_page = fp; 1008 1009 rw_wlock(&vm_phys_fictitious_reg_lock); 1010 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); 1011 rw_wunlock(&vm_phys_fictitious_reg_lock); 1012 1013 return (0); 1014 } 1015 1016 void 1017 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) 1018 { 1019 struct vm_phys_fictitious_seg *seg, tmp; 1020 #ifdef VM_PHYSSEG_DENSE 1021 long pi, pe; 1022 #endif 1023 1024 KASSERT(start < end, 1025 ("Start of segment isn't less than end (start: %jx end: %jx)", 1026 (uintmax_t)start, (uintmax_t)end)); 1027 1028 #ifdef VM_PHYSSEG_DENSE 1029 pi = atop(start); 1030 pe = atop(end); 1031 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1032 if ((pe - first_page) <= vm_page_array_size) { 1033 /* 1034 * This segment was allocated using vm_page_array 1035 * only, there's nothing to do since those pages 1036 * were never added to the tree. 1037 */ 1038 return; 1039 } 1040 /* 1041 * We have a segment that starts inside 1042 * of vm_page_array, but ends outside of it. 1043 * 1044 * Calculate how many pages were added to the 1045 * tree and free them. 1046 */ 1047 start = ptoa(first_page + vm_page_array_size); 1048 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1049 /* 1050 * We have a segment that ends inside of vm_page_array, 1051 * but starts outside of it. 1052 */ 1053 end = ptoa(first_page); 1054 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1055 /* Since it's not possible to register such a range, panic. */ 1056 panic( 1057 "Unregistering not registered fictitious range [%#jx:%#jx]", 1058 (uintmax_t)start, (uintmax_t)end); 1059 } 1060 #endif 1061 tmp.start = start; 1062 tmp.end = 0; 1063 1064 rw_wlock(&vm_phys_fictitious_reg_lock); 1065 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1066 if (seg->start != start || seg->end != end) { 1067 rw_wunlock(&vm_phys_fictitious_reg_lock); 1068 panic( 1069 "Unregistering not registered fictitious range [%#jx:%#jx]", 1070 (uintmax_t)start, (uintmax_t)end); 1071 } 1072 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); 1073 rw_wunlock(&vm_phys_fictitious_reg_lock); 1074 free(seg->first_page, M_FICT_PAGES); 1075 free(seg, M_FICT_PAGES); 1076 } 1077 1078 /* 1079 * Free a contiguous, power of two-sized set of physical pages. 1080 * 1081 * The free page queues must be locked. 1082 */ 1083 void 1084 vm_phys_free_pages(vm_page_t m, int order) 1085 { 1086 struct vm_freelist *fl; 1087 struct vm_phys_seg *seg; 1088 vm_paddr_t pa; 1089 vm_page_t m_buddy; 1090 1091 KASSERT(m->order == VM_NFREEORDER, 1092 ("vm_phys_free_pages: page %p has unexpected order %d", 1093 m, m->order)); 1094 KASSERT(m->pool < VM_NFREEPOOL, 1095 ("vm_phys_free_pages: page %p has unexpected pool %d", 1096 m, m->pool)); 1097 KASSERT(order < VM_NFREEORDER, 1098 ("vm_phys_free_pages: order %d is out of range", order)); 1099 seg = &vm_phys_segs[m->segind]; 1100 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1101 if (order < VM_NFREEORDER - 1) { 1102 pa = VM_PAGE_TO_PHYS(m); 1103 do { 1104 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); 1105 if (pa < seg->start || pa >= seg->end) 1106 break; 1107 m_buddy = &seg->first_page[atop(pa - seg->start)]; 1108 if (m_buddy->order != order) 1109 break; 1110 fl = (*seg->free_queues)[m_buddy->pool]; 1111 vm_freelist_rem(fl, m_buddy, order); 1112 if (m_buddy->pool != m->pool) 1113 vm_phys_set_pool(m->pool, m_buddy, order); 1114 order++; 1115 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); 1116 m = &seg->first_page[atop(pa - seg->start)]; 1117 } while (order < VM_NFREEORDER - 1); 1118 } 1119 fl = (*seg->free_queues)[m->pool]; 1120 vm_freelist_add(fl, m, order, 1); 1121 } 1122 1123 /* 1124 * Return the largest possible order of a set of pages starting at m. 1125 */ 1126 static int 1127 max_order(vm_page_t m) 1128 { 1129 1130 /* 1131 * Unsigned "min" is used here so that "order" is assigned 1132 * "VM_NFREEORDER - 1" when "m"'s physical address is zero 1133 * or the low-order bits of its physical address are zero 1134 * because the size of a physical address exceeds the size of 1135 * a long. 1136 */ 1137 return (min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, 1138 VM_NFREEORDER - 1)); 1139 } 1140 1141 /* 1142 * Free a contiguous, arbitrarily sized set of physical pages, without 1143 * merging across set boundaries. 1144 * 1145 * The free page queues must be locked. 1146 */ 1147 void 1148 vm_phys_enqueue_contig(vm_page_t m, u_long npages) 1149 { 1150 struct vm_freelist *fl; 1151 struct vm_phys_seg *seg; 1152 vm_page_t m_end; 1153 int order; 1154 1155 /* 1156 * Avoid unnecessary coalescing by freeing the pages in the largest 1157 * possible power-of-two-sized subsets. 1158 */ 1159 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1160 seg = &vm_phys_segs[m->segind]; 1161 fl = (*seg->free_queues)[m->pool]; 1162 m_end = m + npages; 1163 /* Free blocks of increasing size. */ 1164 while ((order = max_order(m)) < VM_NFREEORDER - 1 && 1165 m + (1 << order) <= m_end) { 1166 KASSERT(seg == &vm_phys_segs[m->segind], 1167 ("%s: page range [%p,%p) spans multiple segments", 1168 __func__, m_end - npages, m)); 1169 vm_freelist_add(fl, m, order, 1); 1170 m += 1 << order; 1171 } 1172 /* Free blocks of maximum size. */ 1173 while (m + (1 << order) <= m_end) { 1174 KASSERT(seg == &vm_phys_segs[m->segind], 1175 ("%s: page range [%p,%p) spans multiple segments", 1176 __func__, m_end - npages, m)); 1177 vm_freelist_add(fl, m, order, 1); 1178 m += 1 << order; 1179 } 1180 /* Free blocks of diminishing size. */ 1181 while (m < m_end) { 1182 KASSERT(seg == &vm_phys_segs[m->segind], 1183 ("%s: page range [%p,%p) spans multiple segments", 1184 __func__, m_end - npages, m)); 1185 order = flsl(m_end - m) - 1; 1186 vm_freelist_add(fl, m, order, 1); 1187 m += 1 << order; 1188 } 1189 } 1190 1191 /* 1192 * Free a contiguous, arbitrarily sized set of physical pages. 1193 * 1194 * The free page queues must be locked. 1195 */ 1196 void 1197 vm_phys_free_contig(vm_page_t m, u_long npages) 1198 { 1199 int order_start, order_end; 1200 vm_page_t m_start, m_end; 1201 1202 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1203 1204 m_start = m; 1205 order_start = max_order(m_start); 1206 if (order_start < VM_NFREEORDER - 1) 1207 m_start += 1 << order_start; 1208 m_end = m + npages; 1209 order_end = max_order(m_end); 1210 if (order_end < VM_NFREEORDER - 1) 1211 m_end -= 1 << order_end; 1212 /* 1213 * Avoid unnecessary coalescing by freeing the pages at the start and 1214 * end of the range last. 1215 */ 1216 if (m_start < m_end) 1217 vm_phys_enqueue_contig(m_start, m_end - m_start); 1218 if (order_start < VM_NFREEORDER - 1) 1219 vm_phys_free_pages(m, order_start); 1220 if (order_end < VM_NFREEORDER - 1) 1221 vm_phys_free_pages(m_end, order_end); 1222 } 1223 1224 /* 1225 * Scan physical memory between the specified addresses "low" and "high" for a 1226 * run of contiguous physical pages that satisfy the specified conditions, and 1227 * return the lowest page in the run. The specified "alignment" determines 1228 * the alignment of the lowest physical page in the run. If the specified 1229 * "boundary" is non-zero, then the run of physical pages cannot span a 1230 * physical address that is a multiple of "boundary". 1231 * 1232 * "npages" must be greater than zero. Both "alignment" and "boundary" must 1233 * be a power of two. 1234 */ 1235 vm_page_t 1236 vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1237 u_long alignment, vm_paddr_t boundary, int options) 1238 { 1239 vm_paddr_t pa_end; 1240 vm_page_t m_end, m_run, m_start; 1241 struct vm_phys_seg *seg; 1242 int segind; 1243 1244 KASSERT(npages > 0, ("npages is 0")); 1245 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1246 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1247 if (low >= high) 1248 return (NULL); 1249 for (segind = 0; segind < vm_phys_nsegs; segind++) { 1250 seg = &vm_phys_segs[segind]; 1251 if (seg->domain != domain) 1252 continue; 1253 if (seg->start >= high) 1254 break; 1255 if (low >= seg->end) 1256 continue; 1257 if (low <= seg->start) 1258 m_start = seg->first_page; 1259 else 1260 m_start = &seg->first_page[atop(low - seg->start)]; 1261 if (high < seg->end) 1262 pa_end = high; 1263 else 1264 pa_end = seg->end; 1265 if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) 1266 continue; 1267 m_end = &seg->first_page[atop(pa_end - seg->start)]; 1268 m_run = vm_page_scan_contig(npages, m_start, m_end, 1269 alignment, boundary, options); 1270 if (m_run != NULL) 1271 return (m_run); 1272 } 1273 return (NULL); 1274 } 1275 1276 /* 1277 * Set the pool for a contiguous, power of two-sized set of physical pages. 1278 */ 1279 void 1280 vm_phys_set_pool(int pool, vm_page_t m, int order) 1281 { 1282 vm_page_t m_tmp; 1283 1284 for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) 1285 m_tmp->pool = pool; 1286 } 1287 1288 /* 1289 * Search for the given physical page "m" in the free lists. If the search 1290 * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return 1291 * FALSE, indicating that "m" is not in the free lists. 1292 * 1293 * The free page queues must be locked. 1294 */ 1295 boolean_t 1296 vm_phys_unfree_page(vm_page_t m) 1297 { 1298 struct vm_freelist *fl; 1299 struct vm_phys_seg *seg; 1300 vm_paddr_t pa, pa_half; 1301 vm_page_t m_set, m_tmp; 1302 int order; 1303 1304 /* 1305 * First, find the contiguous, power of two-sized set of free 1306 * physical pages containing the given physical page "m" and 1307 * assign it to "m_set". 1308 */ 1309 seg = &vm_phys_segs[m->segind]; 1310 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1311 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && 1312 order < VM_NFREEORDER - 1; ) { 1313 order++; 1314 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); 1315 if (pa >= seg->start) 1316 m_set = &seg->first_page[atop(pa - seg->start)]; 1317 else 1318 return (FALSE); 1319 } 1320 if (m_set->order < order) 1321 return (FALSE); 1322 if (m_set->order == VM_NFREEORDER) 1323 return (FALSE); 1324 KASSERT(m_set->order < VM_NFREEORDER, 1325 ("vm_phys_unfree_page: page %p has unexpected order %d", 1326 m_set, m_set->order)); 1327 1328 /* 1329 * Next, remove "m_set" from the free lists. Finally, extract 1330 * "m" from "m_set" using an iterative algorithm: While "m_set" 1331 * is larger than a page, shrink "m_set" by returning the half 1332 * of "m_set" that does not contain "m" to the free lists. 1333 */ 1334 fl = (*seg->free_queues)[m_set->pool]; 1335 order = m_set->order; 1336 vm_freelist_rem(fl, m_set, order); 1337 while (order > 0) { 1338 order--; 1339 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); 1340 if (m->phys_addr < pa_half) 1341 m_tmp = &seg->first_page[atop(pa_half - seg->start)]; 1342 else { 1343 m_tmp = m_set; 1344 m_set = &seg->first_page[atop(pa_half - seg->start)]; 1345 } 1346 vm_freelist_add(fl, m_tmp, order, 0); 1347 } 1348 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); 1349 return (TRUE); 1350 } 1351 1352 /* 1353 * Allocate a contiguous set of physical pages of the given size 1354 * "npages" from the free lists. All of the physical pages must be at 1355 * or above the given physical address "low" and below the given 1356 * physical address "high". The given value "alignment" determines the 1357 * alignment of the first physical page in the set. If the given value 1358 * "boundary" is non-zero, then the set of physical pages cannot cross 1359 * any physical address boundary that is a multiple of that value. Both 1360 * "alignment" and "boundary" must be a power of two. 1361 */ 1362 vm_page_t 1363 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1364 u_long alignment, vm_paddr_t boundary) 1365 { 1366 vm_paddr_t pa_end, pa_start; 1367 vm_page_t m_run; 1368 struct vm_phys_seg *seg; 1369 int segind; 1370 1371 KASSERT(npages > 0, ("npages is 0")); 1372 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1373 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1374 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 1375 if (low >= high) 1376 return (NULL); 1377 m_run = NULL; 1378 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 1379 seg = &vm_phys_segs[segind]; 1380 if (seg->start >= high || seg->domain != domain) 1381 continue; 1382 if (low >= seg->end) 1383 break; 1384 if (low <= seg->start) 1385 pa_start = seg->start; 1386 else 1387 pa_start = low; 1388 if (high < seg->end) 1389 pa_end = high; 1390 else 1391 pa_end = seg->end; 1392 if (pa_end - pa_start < ptoa(npages)) 1393 continue; 1394 m_run = vm_phys_alloc_seg_contig(seg, npages, low, high, 1395 alignment, boundary); 1396 if (m_run != NULL) 1397 break; 1398 } 1399 return (m_run); 1400 } 1401 1402 /* 1403 * Allocate a run of contiguous physical pages from the free list for the 1404 * specified segment. 1405 */ 1406 static vm_page_t 1407 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, 1408 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1409 { 1410 struct vm_freelist *fl; 1411 vm_paddr_t pa, pa_end, size; 1412 vm_page_t m, m_ret; 1413 u_long npages_end; 1414 int oind, order, pind; 1415 1416 KASSERT(npages > 0, ("npages is 0")); 1417 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1418 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1419 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1420 /* Compute the queue that is the best fit for npages. */ 1421 order = flsl(npages - 1); 1422 /* Search for a run satisfying the specified conditions. */ 1423 size = npages << PAGE_SHIFT; 1424 for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; 1425 oind++) { 1426 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1427 fl = (*seg->free_queues)[pind]; 1428 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { 1429 /* 1430 * Is the size of this allocation request 1431 * larger than the largest block size? 1432 */ 1433 if (order >= VM_NFREEORDER) { 1434 /* 1435 * Determine if a sufficient number of 1436 * subsequent blocks to satisfy the 1437 * allocation request are free. 1438 */ 1439 pa = VM_PAGE_TO_PHYS(m_ret); 1440 pa_end = pa + size; 1441 if (pa_end < pa) 1442 continue; 1443 for (;;) { 1444 pa += 1 << (PAGE_SHIFT + 1445 VM_NFREEORDER - 1); 1446 if (pa >= pa_end || 1447 pa < seg->start || 1448 pa >= seg->end) 1449 break; 1450 m = &seg->first_page[atop(pa - 1451 seg->start)]; 1452 if (m->order != VM_NFREEORDER - 1453 1) 1454 break; 1455 } 1456 /* If not, go to the next block. */ 1457 if (pa < pa_end) 1458 continue; 1459 } 1460 1461 /* 1462 * Determine if the blocks are within the 1463 * given range, satisfy the given alignment, 1464 * and do not cross the given boundary. 1465 */ 1466 pa = VM_PAGE_TO_PHYS(m_ret); 1467 pa_end = pa + size; 1468 if (pa >= low && pa_end <= high && 1469 (pa & (alignment - 1)) == 0 && 1470 rounddown2(pa ^ (pa_end - 1), boundary) == 0) 1471 goto done; 1472 } 1473 } 1474 } 1475 return (NULL); 1476 done: 1477 for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { 1478 fl = (*seg->free_queues)[m->pool]; 1479 vm_freelist_rem(fl, m, oind); 1480 if (m->pool != VM_FREEPOOL_DEFAULT) 1481 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind); 1482 } 1483 /* Return excess pages to the free lists. */ 1484 npages_end = roundup2(npages, 1 << oind); 1485 if (npages < npages_end) { 1486 fl = (*seg->free_queues)[VM_FREEPOOL_DEFAULT]; 1487 vm_phys_enq_range(&m_ret[npages], npages_end - npages, fl, 0); 1488 } 1489 return (m_ret); 1490 } 1491 1492 /* 1493 * Return the index of the first unused slot which may be the terminating 1494 * entry. 1495 */ 1496 static int 1497 vm_phys_avail_count(void) 1498 { 1499 int i; 1500 1501 for (i = 0; phys_avail[i + 1]; i += 2) 1502 continue; 1503 if (i > PHYS_AVAIL_ENTRIES) 1504 panic("Improperly terminated phys_avail %d entries", i); 1505 1506 return (i); 1507 } 1508 1509 /* 1510 * Assert that a phys_avail entry is valid. 1511 */ 1512 static void 1513 vm_phys_avail_check(int i) 1514 { 1515 if (phys_avail[i] & PAGE_MASK) 1516 panic("Unaligned phys_avail[%d]: %#jx", i, 1517 (intmax_t)phys_avail[i]); 1518 if (phys_avail[i+1] & PAGE_MASK) 1519 panic("Unaligned phys_avail[%d + 1]: %#jx", i, 1520 (intmax_t)phys_avail[i]); 1521 if (phys_avail[i + 1] < phys_avail[i]) 1522 panic("phys_avail[%d] start %#jx < end %#jx", i, 1523 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]); 1524 } 1525 1526 /* 1527 * Return the index of an overlapping phys_avail entry or -1. 1528 */ 1529 #ifdef NUMA 1530 static int 1531 vm_phys_avail_find(vm_paddr_t pa) 1532 { 1533 int i; 1534 1535 for (i = 0; phys_avail[i + 1]; i += 2) 1536 if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) 1537 return (i); 1538 return (-1); 1539 } 1540 #endif 1541 1542 /* 1543 * Return the index of the largest entry. 1544 */ 1545 int 1546 vm_phys_avail_largest(void) 1547 { 1548 vm_paddr_t sz, largesz; 1549 int largest; 1550 int i; 1551 1552 largest = 0; 1553 largesz = 0; 1554 for (i = 0; phys_avail[i + 1]; i += 2) { 1555 sz = vm_phys_avail_size(i); 1556 if (sz > largesz) { 1557 largesz = sz; 1558 largest = i; 1559 } 1560 } 1561 1562 return (largest); 1563 } 1564 1565 vm_paddr_t 1566 vm_phys_avail_size(int i) 1567 { 1568 1569 return (phys_avail[i + 1] - phys_avail[i]); 1570 } 1571 1572 /* 1573 * Split an entry at the address 'pa'. Return zero on success or errno. 1574 */ 1575 static int 1576 vm_phys_avail_split(vm_paddr_t pa, int i) 1577 { 1578 int cnt; 1579 1580 vm_phys_avail_check(i); 1581 if (pa <= phys_avail[i] || pa >= phys_avail[i + 1]) 1582 panic("vm_phys_avail_split: invalid address"); 1583 cnt = vm_phys_avail_count(); 1584 if (cnt >= PHYS_AVAIL_ENTRIES) 1585 return (ENOSPC); 1586 memmove(&phys_avail[i + 2], &phys_avail[i], 1587 (cnt - i) * sizeof(phys_avail[0])); 1588 phys_avail[i + 1] = pa; 1589 phys_avail[i + 2] = pa; 1590 vm_phys_avail_check(i); 1591 vm_phys_avail_check(i+2); 1592 1593 return (0); 1594 } 1595 1596 /* 1597 * Check if a given physical address can be included as part of a crash dump. 1598 */ 1599 bool 1600 vm_phys_is_dumpable(vm_paddr_t pa) 1601 { 1602 vm_page_t m; 1603 int i; 1604 1605 if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) 1606 return ((m->flags & PG_NODUMP) == 0); 1607 1608 for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { 1609 if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) 1610 return (true); 1611 } 1612 return (false); 1613 } 1614 1615 void 1616 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end) 1617 { 1618 struct vm_phys_seg *seg; 1619 1620 if (vm_phys_early_nsegs == -1) 1621 panic("%s: called after initialization", __func__); 1622 if (vm_phys_early_nsegs == nitems(vm_phys_early_segs)) 1623 panic("%s: ran out of early segments", __func__); 1624 1625 seg = &vm_phys_early_segs[vm_phys_early_nsegs++]; 1626 seg->start = start; 1627 seg->end = end; 1628 } 1629 1630 /* 1631 * This routine allocates NUMA node specific memory before the page 1632 * allocator is bootstrapped. 1633 */ 1634 vm_paddr_t 1635 vm_phys_early_alloc(int domain, size_t alloc_size) 1636 { 1637 int i, mem_index, biggestone; 1638 vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align; 1639 1640 KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains), 1641 ("%s: invalid domain index %d", __func__, domain)); 1642 1643 /* 1644 * Search the mem_affinity array for the biggest address 1645 * range in the desired domain. This is used to constrain 1646 * the phys_avail selection below. 1647 */ 1648 biggestsize = 0; 1649 mem_index = 0; 1650 mem_start = 0; 1651 mem_end = -1; 1652 #ifdef NUMA 1653 if (mem_affinity != NULL) { 1654 for (i = 0;; i++) { 1655 size = mem_affinity[i].end - mem_affinity[i].start; 1656 if (size == 0) 1657 break; 1658 if (domain != -1 && mem_affinity[i].domain != domain) 1659 continue; 1660 if (size > biggestsize) { 1661 mem_index = i; 1662 biggestsize = size; 1663 } 1664 } 1665 mem_start = mem_affinity[mem_index].start; 1666 mem_end = mem_affinity[mem_index].end; 1667 } 1668 #endif 1669 1670 /* 1671 * Now find biggest physical segment in within the desired 1672 * numa domain. 1673 */ 1674 biggestsize = 0; 1675 biggestone = 0; 1676 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1677 /* skip regions that are out of range */ 1678 if (phys_avail[i+1] - alloc_size < mem_start || 1679 phys_avail[i+1] > mem_end) 1680 continue; 1681 size = vm_phys_avail_size(i); 1682 if (size > biggestsize) { 1683 biggestone = i; 1684 biggestsize = size; 1685 } 1686 } 1687 alloc_size = round_page(alloc_size); 1688 1689 /* 1690 * Grab single pages from the front to reduce fragmentation. 1691 */ 1692 if (alloc_size == PAGE_SIZE) { 1693 pa = phys_avail[biggestone]; 1694 phys_avail[biggestone] += PAGE_SIZE; 1695 vm_phys_avail_check(biggestone); 1696 return (pa); 1697 } 1698 1699 /* 1700 * Naturally align large allocations. 1701 */ 1702 align = phys_avail[biggestone + 1] & (alloc_size - 1); 1703 if (alloc_size + align > biggestsize) 1704 panic("cannot find a large enough size\n"); 1705 if (align != 0 && 1706 vm_phys_avail_split(phys_avail[biggestone + 1] - align, 1707 biggestone) != 0) 1708 /* Wasting memory. */ 1709 phys_avail[biggestone + 1] -= align; 1710 1711 phys_avail[biggestone + 1] -= alloc_size; 1712 vm_phys_avail_check(biggestone); 1713 pa = phys_avail[biggestone + 1]; 1714 return (pa); 1715 } 1716 1717 void 1718 vm_phys_early_startup(void) 1719 { 1720 struct vm_phys_seg *seg; 1721 int i; 1722 1723 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1724 phys_avail[i] = round_page(phys_avail[i]); 1725 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 1726 } 1727 1728 for (i = 0; i < vm_phys_early_nsegs; i++) { 1729 seg = &vm_phys_early_segs[i]; 1730 vm_phys_add_seg(seg->start, seg->end); 1731 } 1732 vm_phys_early_nsegs = -1; 1733 1734 #ifdef NUMA 1735 /* Force phys_avail to be split by domain. */ 1736 if (mem_affinity != NULL) { 1737 int idx; 1738 1739 for (i = 0; mem_affinity[i].end != 0; i++) { 1740 idx = vm_phys_avail_find(mem_affinity[i].start); 1741 if (idx != -1 && 1742 phys_avail[idx] != mem_affinity[i].start) 1743 vm_phys_avail_split(mem_affinity[i].start, idx); 1744 idx = vm_phys_avail_find(mem_affinity[i].end); 1745 if (idx != -1 && 1746 phys_avail[idx] != mem_affinity[i].end) 1747 vm_phys_avail_split(mem_affinity[i].end, idx); 1748 } 1749 } 1750 #endif 1751 } 1752 1753 #ifdef DDB 1754 /* 1755 * Show the number of physical pages in each of the free lists. 1756 */ 1757 DB_SHOW_COMMAND(freepages, db_show_freepages) 1758 { 1759 struct vm_freelist *fl; 1760 int flind, oind, pind, dom; 1761 1762 for (dom = 0; dom < vm_ndomains; dom++) { 1763 db_printf("DOMAIN: %d\n", dom); 1764 for (flind = 0; flind < vm_nfreelists; flind++) { 1765 db_printf("FREE LIST %d:\n" 1766 "\n ORDER (SIZE) | NUMBER" 1767 "\n ", flind); 1768 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1769 db_printf(" | POOL %d", pind); 1770 db_printf("\n-- "); 1771 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1772 db_printf("-- -- "); 1773 db_printf("--\n"); 1774 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 1775 db_printf(" %2.2d (%6.6dK)", oind, 1776 1 << (PAGE_SHIFT - 10 + oind)); 1777 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1778 fl = vm_phys_free_queues[dom][flind][pind]; 1779 db_printf(" | %6.6d", fl[oind].lcnt); 1780 } 1781 db_printf("\n"); 1782 } 1783 db_printf("\n"); 1784 } 1785 db_printf("\n"); 1786 } 1787 } 1788 #endif 1789