1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2002-2006 Rice University 5 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu> 6 * All rights reserved. 7 * 8 * This software was developed for the FreeBSD Project by Alan L. Cox, 9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Physical memory system implementation 36 * 37 * Any external functions defined by this module are only to be used by the 38 * virtual memory system. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

/* A long is used as a segment bitmask elsewhere; it must hold VM_PHYSSEG_MAX bits. */
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");

#ifdef NUMA
/* Platform-supplied affinity/locality tables; see vm_phys_register_domains(). */
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

/* Sorted-by-start array of physical memory segments; see _vm_phys_create_seg(). */
struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

/* Red-black tree of registered fictitious (device) page ranges. */
RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

/* Protects vm_phys_fictitious_tree. */
static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

/*
 * The buddy free queues: one array of per-order lists for every
 * (domain, freelist, pool) combination.
 */
static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 * An entry of -1 means the corresponding free list was not created;
 * see vm_phys_init().
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
    vm_paddr_t boundary);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

/*
 * Red-black tree helpers for vm fictitious range management.
 */

/*
 * Order a page-search key against a registered range: returns 0 when the
 * key's start address falls inside [range->start, range->end).
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

/*
 * RB-tree comparator.  A node with end == 0 is a search key (a single page
 * address, see vm_phys_fictitious_to_vm_page()); otherwise both arguments
 * are full ranges and overlap is a fatal error.
 */
static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

/*
 * Choose a NUMA domain whose memory overlaps [low, high].  Returns "prefer"
 * when that domain qualifies, otherwise the lowest-numbered qualifying
 * domain.  Panics if no domain overlaps the range.  Returns 0 on non-NUMA
 * configurations or before affinity info is registered.
 */
int
vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match: Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	/* One table per (domain, freelist): rows are orders, columns pools. */
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f, int t)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	/* mem_locality is a flat vm_ndomains x vm_ndomains matrix. */
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	/* One row per domain; -1 entries mean no locality information. */
	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

/*
 * Insert the page "m" as an order-"order" buddy block on free list "fl",
 * at the tail when "tail" is non-zero, else at the head.
 */
static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

/*
 * Remove the block headed by "m" from free list "fl"; VM_NFREEORDER in
 * m->order marks the page as not free.
 */
static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	/* Insertion sort: shift higher-addressed segments up to keep order. */
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

/*
 * Create one or more segments covering [start, end), splitting at NUMA
 * domain boundaries given by mem_affinity when present.
 */
static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		/* Segment spans a domain boundary; emit the covered prefix. */
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
	u_long npages;
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
	npages = 0;
	/* Walk segments from highest address down so "npages" counts >4G memory. */
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
			npages += atop(seg->end - seg->start);
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
    int *locality)
{
#ifdef NUMA
	int d, i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	d = 0;
	TUNABLE_INT_FETCH("vm.numa.disabled", &d);
	if (d)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#else
	(void)ndomains;
	(void)affinity;
	(void)locality;
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective being to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int tail)
{
	vm_page_t m_buddy;

	/*
	 * Halve the block repeatedly, returning the upper buddy of each
	 * halving to the free list until only an order-"order" block headed
	 * by "m" remains with the caller.
	 */
	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, tail);
        }
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	u_int n;
	int order;

	KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0"));
	KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	/*
	 * Peel off blocks sized by the lowest set bit of the remaining
	 * count; the alignment assertion above guarantees each such block
	 * is correctly aligned for its order.
	 */
	do {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		KASSERT(order < VM_NFREEORDER,
		    ("vm_phys_enq_range: order %d is out of range", order));
		vm_freelist_add(fl, m, order, tail);
		n = 1 << order;
		m += n;
		npages -= n;
	} while (npages > 0);
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().
 *
 * The free page queues for the specified domain must be locked.
736 */ 737 int 738 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) 739 { 740 struct vm_freelist *alt, *fl; 741 vm_page_t m; 742 int avail, end, flind, freelist, i, need, oind, pind; 743 744 KASSERT(domain >= 0 && domain < vm_ndomains, 745 ("vm_phys_alloc_npages: domain %d is out of range", domain)); 746 KASSERT(pool < VM_NFREEPOOL, 747 ("vm_phys_alloc_npages: pool %d is out of range", pool)); 748 KASSERT(npages <= 1 << (VM_NFREEORDER - 1), 749 ("vm_phys_alloc_npages: npages %d is out of range", npages)); 750 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 751 i = 0; 752 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 753 flind = vm_freelist_to_flind[freelist]; 754 if (flind < 0) 755 continue; 756 fl = vm_phys_free_queues[domain][flind][pool]; 757 for (oind = 0; oind < VM_NFREEORDER; oind++) { 758 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 759 vm_freelist_rem(fl, m, oind); 760 avail = 1 << oind; 761 need = imin(npages - i, avail); 762 for (end = i + need; i < end;) 763 ma[i++] = m++; 764 if (need < avail) { 765 /* 766 * Return excess pages to fl. Its 767 * order [0, oind) queues are empty. 768 */ 769 vm_phys_enq_range(m, avail - need, fl, 770 1); 771 return (npages); 772 } else if (i == npages) 773 return (npages); 774 } 775 } 776 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 777 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 778 alt = vm_phys_free_queues[domain][flind][pind]; 779 while ((m = TAILQ_FIRST(&alt[oind].pl)) != 780 NULL) { 781 vm_freelist_rem(alt, m, oind); 782 vm_phys_set_pool(pool, m, oind); 783 avail = 1 << oind; 784 need = imin(npages - i, avail); 785 for (end = i + need; i < end;) 786 ma[i++] = m++; 787 if (need < avail) { 788 /* 789 * Return excess pages to fl. 790 * Its order [0, oind) queues 791 * are empty. 
792 */ 793 vm_phys_enq_range(m, avail - 794 need, fl, 1); 795 return (npages); 796 } else if (i == npages) 797 return (npages); 798 } 799 } 800 } 801 } 802 return (i); 803 } 804 805 /* 806 * Allocate a contiguous, power of two-sized set of physical pages 807 * from the free lists. 808 * 809 * The free page queues must be locked. 810 */ 811 vm_page_t 812 vm_phys_alloc_pages(int domain, int pool, int order) 813 { 814 vm_page_t m; 815 int freelist; 816 817 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 818 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); 819 if (m != NULL) 820 return (m); 821 } 822 return (NULL); 823 } 824 825 /* 826 * Allocate a contiguous, power of two-sized set of physical pages from the 827 * specified free list. The free list must be specified using one of the 828 * manifest constants VM_FREELIST_*. 829 * 830 * The free page queues must be locked. 831 */ 832 vm_page_t 833 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) 834 { 835 struct vm_freelist *alt, *fl; 836 vm_page_t m; 837 int oind, pind, flind; 838 839 KASSERT(domain >= 0 && domain < vm_ndomains, 840 ("vm_phys_alloc_freelist_pages: domain %d is out of range", 841 domain)); 842 KASSERT(freelist < VM_NFREELIST, 843 ("vm_phys_alloc_freelist_pages: freelist %d is out of range", 844 freelist)); 845 KASSERT(pool < VM_NFREEPOOL, 846 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); 847 KASSERT(order < VM_NFREEORDER, 848 ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); 849 850 flind = vm_freelist_to_flind[freelist]; 851 /* Check if freelist is present */ 852 if (flind < 0) 853 return (NULL); 854 855 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 856 fl = &vm_phys_free_queues[domain][flind][pool][0]; 857 for (oind = order; oind < VM_NFREEORDER; oind++) { 858 m = TAILQ_FIRST(&fl[oind].pl); 859 if (m != NULL) { 860 vm_freelist_rem(fl, m, oind); 861 /* The order [order, oind) queues are empty. 
*/ 862 vm_phys_split_pages(m, oind, fl, order, 1); 863 return (m); 864 } 865 } 866 867 /* 868 * The given pool was empty. Find the largest 869 * contiguous, power-of-two-sized set of pages in any 870 * pool. Transfer these pages to the given pool, and 871 * use them to satisfy the allocation. 872 */ 873 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { 874 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 875 alt = &vm_phys_free_queues[domain][flind][pind][0]; 876 m = TAILQ_FIRST(&alt[oind].pl); 877 if (m != NULL) { 878 vm_freelist_rem(alt, m, oind); 879 vm_phys_set_pool(pool, m, oind); 880 /* The order [order, oind) queues are empty. */ 881 vm_phys_split_pages(m, oind, fl, order, 1); 882 return (m); 883 } 884 } 885 } 886 return (NULL); 887 } 888 889 /* 890 * Find the vm_page corresponding to the given physical address. 891 */ 892 vm_page_t 893 vm_phys_paddr_to_vm_page(vm_paddr_t pa) 894 { 895 struct vm_phys_seg *seg; 896 int segind; 897 898 for (segind = 0; segind < vm_phys_nsegs; segind++) { 899 seg = &vm_phys_segs[segind]; 900 if (pa >= seg->start && pa < seg->end) 901 return (&seg->first_page[atop(pa - seg->start)]); 902 } 903 return (NULL); 904 } 905 906 vm_page_t 907 vm_phys_fictitious_to_vm_page(vm_paddr_t pa) 908 { 909 struct vm_phys_fictitious_seg tmp, *seg; 910 vm_page_t m; 911 912 m = NULL; 913 tmp.start = pa; 914 tmp.end = 0; 915 916 rw_rlock(&vm_phys_fictitious_reg_lock); 917 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 918 rw_runlock(&vm_phys_fictitious_reg_lock); 919 if (seg == NULL) 920 return (NULL); 921 922 m = &seg->first_page[atop(pa - seg->start)]; 923 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); 924 925 return (m); 926 } 927 928 static inline void 929 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, 930 long page_count, vm_memattr_t memattr) 931 { 932 long i; 933 934 bzero(range, page_count * sizeof(*range)); 935 for (i = 0; i < page_count; i++) { 936 vm_page_initfake(&range[i], start + 
PAGE_SIZE * i, memattr); 937 range[i].oflags &= ~VPO_UNMANAGED; 938 range[i].busy_lock = VPB_UNBUSIED; 939 } 940 } 941 942 int 943 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, 944 vm_memattr_t memattr) 945 { 946 struct vm_phys_fictitious_seg *seg; 947 vm_page_t fp; 948 long page_count; 949 #ifdef VM_PHYSSEG_DENSE 950 long pi, pe; 951 long dpage_count; 952 #endif 953 954 KASSERT(start < end, 955 ("Start of segment isn't less than end (start: %jx end: %jx)", 956 (uintmax_t)start, (uintmax_t)end)); 957 958 page_count = (end - start) / PAGE_SIZE; 959 960 #ifdef VM_PHYSSEG_DENSE 961 pi = atop(start); 962 pe = atop(end); 963 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 964 fp = &vm_page_array[pi - first_page]; 965 if ((pe - first_page) > vm_page_array_size) { 966 /* 967 * We have a segment that starts inside 968 * of vm_page_array, but ends outside of it. 969 * 970 * Use vm_page_array pages for those that are 971 * inside of the vm_page_array range, and 972 * allocate the remaining ones. 973 */ 974 dpage_count = vm_page_array_size - (pi - first_page); 975 vm_phys_fictitious_init_range(fp, start, dpage_count, 976 memattr); 977 page_count -= dpage_count; 978 start += ptoa(dpage_count); 979 goto alloc; 980 } 981 /* 982 * We can allocate the full range from vm_page_array, 983 * so there's no need to register the range in the tree. 984 */ 985 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 986 return (0); 987 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 988 /* 989 * We have a segment that ends inside of vm_page_array, 990 * but starts outside of it. 
991 */ 992 fp = &vm_page_array[0]; 993 dpage_count = pe - first_page; 994 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, 995 memattr); 996 end -= ptoa(dpage_count); 997 page_count -= dpage_count; 998 goto alloc; 999 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1000 /* 1001 * Trying to register a fictitious range that expands before 1002 * and after vm_page_array. 1003 */ 1004 return (EINVAL); 1005 } else { 1006 alloc: 1007 #endif 1008 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, 1009 M_WAITOK); 1010 #ifdef VM_PHYSSEG_DENSE 1011 } 1012 #endif 1013 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1014 1015 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); 1016 seg->start = start; 1017 seg->end = end; 1018 seg->first_page = fp; 1019 1020 rw_wlock(&vm_phys_fictitious_reg_lock); 1021 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); 1022 rw_wunlock(&vm_phys_fictitious_reg_lock); 1023 1024 return (0); 1025 } 1026 1027 void 1028 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) 1029 { 1030 struct vm_phys_fictitious_seg *seg, tmp; 1031 #ifdef VM_PHYSSEG_DENSE 1032 long pi, pe; 1033 #endif 1034 1035 KASSERT(start < end, 1036 ("Start of segment isn't less than end (start: %jx end: %jx)", 1037 (uintmax_t)start, (uintmax_t)end)); 1038 1039 #ifdef VM_PHYSSEG_DENSE 1040 pi = atop(start); 1041 pe = atop(end); 1042 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1043 if ((pe - first_page) <= vm_page_array_size) { 1044 /* 1045 * This segment was allocated using vm_page_array 1046 * only, there's nothing to do since those pages 1047 * were never added to the tree. 1048 */ 1049 return; 1050 } 1051 /* 1052 * We have a segment that starts inside 1053 * of vm_page_array, but ends outside of it. 1054 * 1055 * Calculate how many pages were added to the 1056 * tree and free them. 
1057 */ 1058 start = ptoa(first_page + vm_page_array_size); 1059 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1060 /* 1061 * We have a segment that ends inside of vm_page_array, 1062 * but starts outside of it. 1063 */ 1064 end = ptoa(first_page); 1065 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1066 /* Since it's not possible to register such a range, panic. */ 1067 panic( 1068 "Unregistering not registered fictitious range [%#jx:%#jx]", 1069 (uintmax_t)start, (uintmax_t)end); 1070 } 1071 #endif 1072 tmp.start = start; 1073 tmp.end = 0; 1074 1075 rw_wlock(&vm_phys_fictitious_reg_lock); 1076 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1077 if (seg->start != start || seg->end != end) { 1078 rw_wunlock(&vm_phys_fictitious_reg_lock); 1079 panic( 1080 "Unregistering not registered fictitious range [%#jx:%#jx]", 1081 (uintmax_t)start, (uintmax_t)end); 1082 } 1083 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); 1084 rw_wunlock(&vm_phys_fictitious_reg_lock); 1085 free(seg->first_page, M_FICT_PAGES); 1086 free(seg, M_FICT_PAGES); 1087 } 1088 1089 /* 1090 * Free a contiguous, power of two-sized set of physical pages. 1091 * 1092 * The free page queues must be locked. 
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		/*
		 * Coalesce with free buddies: flipping the order-sized
		 * address bit yields the buddy's physical address.  Keep
		 * merging until the buddy is absent, busy, or the maximum
		 * order is reached.
		 */
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			/* Stop if the buddy lies outside this segment. */
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			/* Stop unless the buddy is free at exactly "order". */
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			/* Keep the merged block in "m"'s pool. */
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			/* Round down to the start of the merged block. */
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}

/*
 * Return the largest possible order of a set of pages starting at m.
 */
static int
max_order(vm_page_t m)
{

	/*
	 * Unsigned "min" is used here so that "order" is assigned
	 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
	 * or the low-order bits of its physical address are zero
	 * because the size of a physical address exceeds the size of
	 * a long.
	 */
	return (min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
	    VM_NFREEORDER - 1));
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[m->pool];
	m_end = m + npages;
	/*
	 * Free blocks of increasing size: each block's size is limited by
	 * the alignment of its starting address (max_order()) until an
	 * address aligned to the maximum order is reached.  In the KASSERTs
	 * below, "m_end - npages" recovers the original start of the range.
	 */
	while ((order = max_order(m)) < VM_NFREEORDER - 1 &&
	    m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
	/*
	 * Free blocks of maximum size.  "order" was set by the loop above:
	 * either it is VM_NFREEORDER - 1 or the remaining range is already
	 * smaller than 1 << order, in which case this loop does not run.
	 */
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
	/*
	 * Free blocks of diminishing size: the remainder is smaller than a
	 * maximum-order block, so peel off the largest power of two that
	 * still fits each time.
	 */
	while (m < m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		order = flsl(m_end - m) - 1;
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	int order_start, order_end;
	vm_page_t m_start, m_end;

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	/*
	 * Trim a less-than-maximum-order block off each end of the range:
	 * those boundary blocks are freed last, via vm_phys_free_pages(),
	 * so that they may coalesce with neighboring free blocks outside
	 * of the range.
	 */
	m_start = m;
	order_start = max_order(m_start);
	if (order_start < VM_NFREEORDER - 1)
		m_start += 1 << order_start;
	m_end = m + npages;
	order_end = max_order(m_end);
	if (order_end < VM_NFREEORDER - 1)
		m_end -= 1 << order_end;
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, m_end - m_start);
	if (order_start < VM_NFREEORDER - 1)
		vm_phys_free_pages(m, order_start);
	if (order_end < VM_NFREEORDER - 1)
		vm_phys_free_pages(m_end, order_end);
}

/*
 * Scan physical memory between the specified addresses "low" and "high" for a
 * run of contiguous physical pages that satisfy the specified conditions, and
 * return the lowest page in the run.  The specified "alignment" determines
 * the alignment of the lowest physical page in the run.  If the specified
 * "boundary" is non-zero, then the run of physical pages cannot span a
 * physical address that is a multiple of "boundary".
 *
 * "npages" must be greater than zero.  Both "alignment" and "boundary" must
 * be a power of two.
1245 */ 1246 vm_page_t 1247 vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1248 u_long alignment, vm_paddr_t boundary, int options) 1249 { 1250 vm_paddr_t pa_end; 1251 vm_page_t m_end, m_run, m_start; 1252 struct vm_phys_seg *seg; 1253 int segind; 1254 1255 KASSERT(npages > 0, ("npages is 0")); 1256 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1257 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1258 if (low >= high) 1259 return (NULL); 1260 for (segind = 0; segind < vm_phys_nsegs; segind++) { 1261 seg = &vm_phys_segs[segind]; 1262 if (seg->domain != domain) 1263 continue; 1264 if (seg->start >= high) 1265 break; 1266 if (low >= seg->end) 1267 continue; 1268 if (low <= seg->start) 1269 m_start = seg->first_page; 1270 else 1271 m_start = &seg->first_page[atop(low - seg->start)]; 1272 if (high < seg->end) 1273 pa_end = high; 1274 else 1275 pa_end = seg->end; 1276 if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) 1277 continue; 1278 m_end = &seg->first_page[atop(pa_end - seg->start)]; 1279 m_run = vm_page_scan_contig(npages, m_start, m_end, 1280 alignment, boundary, options); 1281 if (m_run != NULL) 1282 return (m_run); 1283 } 1284 return (NULL); 1285 } 1286 1287 /* 1288 * Search for the given physical page "m" in the free lists. If the search 1289 * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return 1290 * FALSE, indicating that "m" is not in the free lists. 1291 * 1292 * The free page queues must be locked. 1293 */ 1294 boolean_t 1295 vm_phys_unfree_page(vm_page_t m) 1296 { 1297 struct vm_freelist *fl; 1298 struct vm_phys_seg *seg; 1299 vm_paddr_t pa, pa_half; 1300 vm_page_t m_set, m_tmp; 1301 int order; 1302 1303 /* 1304 * First, find the contiguous, power of two-sized set of free 1305 * physical pages containing the given physical page "m" and 1306 * assign it to "m_set". 
1307 */ 1308 seg = &vm_phys_segs[m->segind]; 1309 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1310 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && 1311 order < VM_NFREEORDER - 1; ) { 1312 order++; 1313 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); 1314 if (pa >= seg->start) 1315 m_set = &seg->first_page[atop(pa - seg->start)]; 1316 else 1317 return (FALSE); 1318 } 1319 if (m_set->order < order) 1320 return (FALSE); 1321 if (m_set->order == VM_NFREEORDER) 1322 return (FALSE); 1323 KASSERT(m_set->order < VM_NFREEORDER, 1324 ("vm_phys_unfree_page: page %p has unexpected order %d", 1325 m_set, m_set->order)); 1326 1327 /* 1328 * Next, remove "m_set" from the free lists. Finally, extract 1329 * "m" from "m_set" using an iterative algorithm: While "m_set" 1330 * is larger than a page, shrink "m_set" by returning the half 1331 * of "m_set" that does not contain "m" to the free lists. 1332 */ 1333 fl = (*seg->free_queues)[m_set->pool]; 1334 order = m_set->order; 1335 vm_freelist_rem(fl, m_set, order); 1336 while (order > 0) { 1337 order--; 1338 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); 1339 if (m->phys_addr < pa_half) 1340 m_tmp = &seg->first_page[atop(pa_half - seg->start)]; 1341 else { 1342 m_tmp = m_set; 1343 m_set = &seg->first_page[atop(pa_half - seg->start)]; 1344 } 1345 vm_freelist_add(fl, m_tmp, order, 0); 1346 } 1347 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); 1348 return (TRUE); 1349 } 1350 1351 /* 1352 * Allocate a contiguous set of physical pages of the given size 1353 * "npages" from the free lists. All of the physical pages must be at 1354 * or above the given physical address "low" and below the given 1355 * physical address "high". The given value "alignment" determines the 1356 * alignment of the first physical page in the set. 
If the given value 1357 * "boundary" is non-zero, then the set of physical pages cannot cross 1358 * any physical address boundary that is a multiple of that value. Both 1359 * "alignment" and "boundary" must be a power of two. 1360 */ 1361 vm_page_t 1362 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1363 u_long alignment, vm_paddr_t boundary) 1364 { 1365 vm_paddr_t pa_end, pa_start; 1366 vm_page_t m_run; 1367 struct vm_phys_seg *seg; 1368 int segind; 1369 1370 KASSERT(npages > 0, ("npages is 0")); 1371 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1372 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1373 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 1374 if (low >= high) 1375 return (NULL); 1376 m_run = NULL; 1377 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 1378 seg = &vm_phys_segs[segind]; 1379 if (seg->start >= high || seg->domain != domain) 1380 continue; 1381 if (low >= seg->end) 1382 break; 1383 if (low <= seg->start) 1384 pa_start = seg->start; 1385 else 1386 pa_start = low; 1387 if (high < seg->end) 1388 pa_end = high; 1389 else 1390 pa_end = seg->end; 1391 if (pa_end - pa_start < ptoa(npages)) 1392 continue; 1393 m_run = vm_phys_alloc_seg_contig(seg, npages, low, high, 1394 alignment, boundary); 1395 if (m_run != NULL) 1396 break; 1397 } 1398 return (m_run); 1399 } 1400 1401 /* 1402 * Allocate a run of contiguous physical pages from the free list for the 1403 * specified segment. 
1404 */ 1405 static vm_page_t 1406 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, 1407 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1408 { 1409 struct vm_freelist *fl; 1410 vm_paddr_t pa, pa_end, size; 1411 vm_page_t m, m_ret; 1412 u_long npages_end; 1413 int oind, order, pind; 1414 1415 KASSERT(npages > 0, ("npages is 0")); 1416 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1417 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1418 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1419 /* Compute the queue that is the best fit for npages. */ 1420 order = flsl(npages - 1); 1421 /* Search for a run satisfying the specified conditions. */ 1422 size = npages << PAGE_SHIFT; 1423 for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; 1424 oind++) { 1425 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1426 fl = (*seg->free_queues)[pind]; 1427 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { 1428 /* 1429 * Is the size of this allocation request 1430 * larger than the largest block size? 1431 */ 1432 if (order >= VM_NFREEORDER) { 1433 /* 1434 * Determine if a sufficient number of 1435 * subsequent blocks to satisfy the 1436 * allocation request are free. 1437 */ 1438 pa = VM_PAGE_TO_PHYS(m_ret); 1439 pa_end = pa + size; 1440 if (pa_end < pa) 1441 continue; 1442 for (;;) { 1443 pa += 1 << (PAGE_SHIFT + 1444 VM_NFREEORDER - 1); 1445 if (pa >= pa_end || 1446 pa < seg->start || 1447 pa >= seg->end) 1448 break; 1449 m = &seg->first_page[atop(pa - 1450 seg->start)]; 1451 if (m->order != VM_NFREEORDER - 1452 1) 1453 break; 1454 } 1455 /* If not, go to the next block. */ 1456 if (pa < pa_end) 1457 continue; 1458 } 1459 1460 /* 1461 * Determine if the blocks are within the 1462 * given range, satisfy the given alignment, 1463 * and do not cross the given boundary. 
1464 */ 1465 pa = VM_PAGE_TO_PHYS(m_ret); 1466 pa_end = pa + size; 1467 if (pa >= low && pa_end <= high && 1468 (pa & (alignment - 1)) == 0 && 1469 rounddown2(pa ^ (pa_end - 1), boundary) == 0) 1470 goto done; 1471 } 1472 } 1473 } 1474 return (NULL); 1475 done: 1476 for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { 1477 fl = (*seg->free_queues)[m->pool]; 1478 vm_freelist_rem(fl, m, oind); 1479 if (m->pool != VM_FREEPOOL_DEFAULT) 1480 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind); 1481 } 1482 /* Return excess pages to the free lists. */ 1483 npages_end = roundup2(npages, 1 << oind); 1484 if (npages < npages_end) { 1485 fl = (*seg->free_queues)[VM_FREEPOOL_DEFAULT]; 1486 vm_phys_enq_range(&m_ret[npages], npages_end - npages, fl, 0); 1487 } 1488 return (m_ret); 1489 } 1490 1491 /* 1492 * Return the index of the first unused slot which may be the terminating 1493 * entry. 1494 */ 1495 static int 1496 vm_phys_avail_count(void) 1497 { 1498 int i; 1499 1500 for (i = 0; phys_avail[i + 1]; i += 2) 1501 continue; 1502 if (i > PHYS_AVAIL_ENTRIES) 1503 panic("Improperly terminated phys_avail %d entries", i); 1504 1505 return (i); 1506 } 1507 1508 /* 1509 * Assert that a phys_avail entry is valid. 1510 */ 1511 static void 1512 vm_phys_avail_check(int i) 1513 { 1514 if (phys_avail[i] & PAGE_MASK) 1515 panic("Unaligned phys_avail[%d]: %#jx", i, 1516 (intmax_t)phys_avail[i]); 1517 if (phys_avail[i+1] & PAGE_MASK) 1518 panic("Unaligned phys_avail[%d + 1]: %#jx", i, 1519 (intmax_t)phys_avail[i]); 1520 if (phys_avail[i + 1] < phys_avail[i]) 1521 panic("phys_avail[%d] start %#jx < end %#jx", i, 1522 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]); 1523 } 1524 1525 /* 1526 * Return the index of an overlapping phys_avail entry or -1. 
1527 */ 1528 #ifdef NUMA 1529 static int 1530 vm_phys_avail_find(vm_paddr_t pa) 1531 { 1532 int i; 1533 1534 for (i = 0; phys_avail[i + 1]; i += 2) 1535 if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) 1536 return (i); 1537 return (-1); 1538 } 1539 #endif 1540 1541 /* 1542 * Return the index of the largest entry. 1543 */ 1544 int 1545 vm_phys_avail_largest(void) 1546 { 1547 vm_paddr_t sz, largesz; 1548 int largest; 1549 int i; 1550 1551 largest = 0; 1552 largesz = 0; 1553 for (i = 0; phys_avail[i + 1]; i += 2) { 1554 sz = vm_phys_avail_size(i); 1555 if (sz > largesz) { 1556 largesz = sz; 1557 largest = i; 1558 } 1559 } 1560 1561 return (largest); 1562 } 1563 1564 vm_paddr_t 1565 vm_phys_avail_size(int i) 1566 { 1567 1568 return (phys_avail[i + 1] - phys_avail[i]); 1569 } 1570 1571 /* 1572 * Split an entry at the address 'pa'. Return zero on success or errno. 1573 */ 1574 static int 1575 vm_phys_avail_split(vm_paddr_t pa, int i) 1576 { 1577 int cnt; 1578 1579 vm_phys_avail_check(i); 1580 if (pa <= phys_avail[i] || pa >= phys_avail[i + 1]) 1581 panic("vm_phys_avail_split: invalid address"); 1582 cnt = vm_phys_avail_count(); 1583 if (cnt >= PHYS_AVAIL_ENTRIES) 1584 return (ENOSPC); 1585 memmove(&phys_avail[i + 2], &phys_avail[i], 1586 (cnt - i) * sizeof(phys_avail[0])); 1587 phys_avail[i + 1] = pa; 1588 phys_avail[i + 2] = pa; 1589 vm_phys_avail_check(i); 1590 vm_phys_avail_check(i+2); 1591 1592 return (0); 1593 } 1594 1595 /* 1596 * Check if a given physical address can be included as part of a crash dump. 
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	/* Prefer the per-page flag when a vm_page exists for "pa". */
	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	/* Otherwise, fall back to the dump_avail address ranges. */
	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

/*
 * Record a physical segment for registration by vm_phys_early_startup().
 * Must not be called after vm_phys_early_startup() has run.
 */
void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	/* vm_phys_early_startup() sets vm_phys_early_nsegs to -1. */
	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
	int i, mem_index, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_index = 0;
	mem_start = 0;
	mem_end = -1;	/* Unsigned: wraps to the maximum address. */
#ifdef NUMA
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			/* The mem_affinity array is zero-size terminated. */
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find biggest physical segment in within the desired
	 * numa domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/*
		 * skip regions that are out of range
		 *
		 * NOTE(review): if phys_avail[i+1] < alloc_size this
		 * unsigned subtraction wraps and the entry is not skipped —
		 * confirm entries are always larger than early allocations.
		 */
		if (phys_avail[i+1] - alloc_size < mem_start ||
		    phys_avail[i+1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Splitting failed: waste the alignment slack instead. */
		phys_avail[biggestone + 1] -= align;

	/* Carve the allocation from the (now aligned) end of the entry. */
	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

/*
 * Page-align all phys_avail entries, register the segments recorded by
 * vm_phys_early_add_seg(), and split phys_avail on NUMA domain edges.
 */
void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	/* Mark early segment registration as closed. */
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
1755 */ 1756 DB_SHOW_COMMAND(freepages, db_show_freepages) 1757 { 1758 struct vm_freelist *fl; 1759 int flind, oind, pind, dom; 1760 1761 for (dom = 0; dom < vm_ndomains; dom++) { 1762 db_printf("DOMAIN: %d\n", dom); 1763 for (flind = 0; flind < vm_nfreelists; flind++) { 1764 db_printf("FREE LIST %d:\n" 1765 "\n ORDER (SIZE) | NUMBER" 1766 "\n ", flind); 1767 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1768 db_printf(" | POOL %d", pind); 1769 db_printf("\n-- "); 1770 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1771 db_printf("-- -- "); 1772 db_printf("--\n"); 1773 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 1774 db_printf(" %2.2d (%6.6dK)", oind, 1775 1 << (PAGE_SHIFT - 10 + oind)); 1776 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1777 fl = vm_phys_free_queues[dom][flind][pind]; 1778 db_printf(" | %6.6d", fl[oind].lcnt); 1779 } 1780 db_printf("\n"); 1781 } 1782 db_printf("\n"); 1783 } 1784 db_printf("\n"); 1785 } 1786 } 1787 #endif 1788