1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2002-2006 Rice University 5 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu> 6 * All rights reserved. 7 * 8 * This software was developed for the FreeBSD Project by Alan L. Cox, 9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Physical memory system implementation 36 * 37 * Any external functions defined by this module are only to be used by the 38 * virtual memory system. 39 */ 40 41 #include <sys/cdefs.h> 42 #include "opt_ddb.h" 43 #include "opt_vm.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/domainset.h> 48 #include <sys/lock.h> 49 #include <sys/kernel.h> 50 #include <sys/malloc.h> 51 #include <sys/mutex.h> 52 #include <sys/proc.h> 53 #include <sys/queue.h> 54 #include <sys/rwlock.h> 55 #include <sys/sbuf.h> 56 #include <sys/sysctl.h> 57 #include <sys/tree.h> 58 #include <sys/vmmeter.h> 59 60 #include <ddb/ddb.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_extern.h> 64 #include <vm/vm_param.h> 65 #include <vm/vm_kern.h> 66 #include <vm/vm_object.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_phys.h> 69 #include <vm/vm_pagequeue.h> 70 71 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, 72 "Too many physsegs."); 73 _Static_assert(sizeof(long long) >= sizeof(vm_paddr_t), 74 "vm_paddr_t too big for ffsll, flsll."); 75 76 #ifdef NUMA 77 struct mem_affinity __read_mostly *mem_affinity; 78 int __read_mostly *mem_locality; 79 80 static int numa_disabled; 81 static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 82 "NUMA options"); 83 SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 84 &numa_disabled, 0, "NUMA-awareness in the allocators is disabled"); 85 #endif 86 87 int __read_mostly vm_ndomains = 1; 88 domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1); 89 90 struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX]; 91 int __read_mostly vm_phys_nsegs; 92 static struct vm_phys_seg vm_phys_early_segs[8]; 93 static int vm_phys_early_nsegs; 94 95 struct vm_phys_fictitious_seg; 96 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, 97 struct vm_phys_fictitious_seg *); 98 99 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = 100 RB_INITIALIZER(&vm_phys_fictitious_tree); 101 102 struct vm_phys_fictitious_seg { 103 RB_ENTRY(vm_phys_fictitious_seg) node; 104 /* Memory region data */ 105 vm_paddr_t start; 106 vm_paddr_t end; 107 vm_page_t first_page; 108 }; 109 110 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, 111 vm_phys_fictitious_cmp); 112 113 static struct rwlock_padalign vm_phys_fictitious_reg_lock; 114 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); 115 116 static struct vm_freelist __aligned(CACHE_LINE_SIZE) 117 vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL] 118 [VM_NFREEORDER_MAX]; 119 120 static int __read_mostly vm_nfreelists; 121 122 /* 123 * These "avail lists" are globals used to communicate boot-time physical 124 * memory layout to other parts of the kernel. Each physically contiguous 125 * region of memory is defined by a start address at an even index and an 126 * end address at the following odd index. Each list is terminated by a 127 * pair of zero entries. 128 * 129 * dump_avail tells the dump code what regions to include in a crash dump, and 130 * phys_avail is all of the remaining physical memory that is available for 131 * the vm system. 132 * 133 * Initially dump_avail and phys_avail are identical. Boot time memory 134 * allocations remove extents from phys_avail that may still be included 135 * in dumps. 136 */ 137 vm_paddr_t phys_avail[PHYS_AVAIL_COUNT]; 138 vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; 139 140 /* 141 * Provides the mapping from VM_FREELIST_* to free list indices (flind). 142 */ 143 static int __read_mostly vm_freelist_to_flind[VM_NFREELIST]; 144 145 CTASSERT(VM_FREELIST_DEFAULT == 0); 146 147 #ifdef VM_FREELIST_DMA32 148 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) 149 #endif 150 151 /* 152 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about 153 * the ordering of the free list boundaries. 154 */ 155 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) 156 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); 157 #endif 158 159 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); 160 SYSCTL_OID(_vm, OID_AUTO, phys_free, 161 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 162 sysctl_vm_phys_free, "A", 163 "Phys Free Info"); 164 165 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); 166 SYSCTL_OID(_vm, OID_AUTO, phys_segs, 167 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 168 sysctl_vm_phys_segs, "A", 169 "Phys Seg Info"); 170 171 #ifdef NUMA 172 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); 173 SYSCTL_OID(_vm, OID_AUTO, phys_locality, 174 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 175 sysctl_vm_phys_locality, "A", 176 "Phys Locality Info"); 177 #endif 178 179 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, 180 &vm_ndomains, 0, "Number of physical memory domains available."); 181 182 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); 183 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); 184 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, 185 int order, int tail); 186 187 /* 188 * Red-black tree helpers for vm fictitious range management. 189 */ 190 static inline int 191 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, 192 struct vm_phys_fictitious_seg *range) 193 { 194 195 KASSERT(range->start != 0 && range->end != 0, 196 ("Invalid range passed on search for vm_fictitious page")); 197 if (p->start >= range->end) 198 return (1); 199 if (p->start < range->start) 200 return (-1); 201 202 return (0); 203 } 204 205 static int 206 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, 207 struct vm_phys_fictitious_seg *p2) 208 { 209 210 /* Check if this is a search for a page */ 211 if (p1->end == 0) 212 return (vm_phys_fictitious_in_range(p1, p2)); 213 214 KASSERT(p2->end != 0, 215 ("Invalid range passed as second parameter to vm fictitious comparison")); 216 217 /* Searching to add a new range */ 218 if (p1->end <= p2->start) 219 return (-1); 220 if (p1->start >= p2->end) 221 return (1); 222 223 panic("Trying to add overlapping vm fictitious ranges:\n" 224 "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, 225 (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); 226 } 227 228 int 229 vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high) 230 { 231 #ifdef NUMA 232 domainset_t mask; 233 int i; 234 235 if (vm_ndomains == 1 || mem_affinity == NULL) 236 return (0); 237 238 DOMAINSET_ZERO(&mask); 239 /* 240 * Check for any memory that overlaps low, high. 241 */ 242 for (i = 0; mem_affinity[i].end != 0; i++) 243 if (mem_affinity[i].start <= high && 244 mem_affinity[i].end >= low) 245 DOMAINSET_SET(mem_affinity[i].domain, &mask); 246 if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask)) 247 return (prefer); 248 if (DOMAINSET_EMPTY(&mask)) 249 panic("vm_phys_domain_match: Impossible constraint"); 250 return (DOMAINSET_FFS(&mask) - 1); 251 #else 252 return (0); 253 #endif 254 } 255 256 /* 257 * Outputs the state of the physical memory allocator, specifically, 258 * the amount of physical memory in each free list. 259 */ 260 static int 261 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) 262 { 263 struct sbuf sbuf; 264 struct vm_freelist *fl; 265 int dom, error, flind, oind, pind; 266 267 error = sysctl_wire_old_buffer(req, 0); 268 if (error != 0) 269 return (error); 270 sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); 271 for (dom = 0; dom < vm_ndomains; dom++) { 272 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); 273 for (flind = 0; flind < vm_nfreelists; flind++) { 274 sbuf_printf(&sbuf, "\nFREE LIST %d:\n" 275 "\n ORDER (SIZE) | NUMBER" 276 "\n ", flind); 277 for (pind = 0; pind < VM_NFREEPOOL; pind++) 278 sbuf_printf(&sbuf, " | POOL %d", pind); 279 sbuf_printf(&sbuf, "\n-- "); 280 for (pind = 0; pind < VM_NFREEPOOL; pind++) 281 sbuf_printf(&sbuf, "-- -- "); 282 sbuf_printf(&sbuf, "--\n"); 283 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 284 sbuf_printf(&sbuf, " %2d (%6dK)", oind, 285 1 << (PAGE_SHIFT - 10 + oind)); 286 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 287 fl = vm_phys_free_queues[dom][flind][pind]; 288 sbuf_printf(&sbuf, " | %6d", 289 fl[oind].lcnt); 290 } 291 sbuf_printf(&sbuf, "\n"); 292 } 293 } 294 } 295 error = sbuf_finish(&sbuf); 296 sbuf_delete(&sbuf); 297 return (error); 298 } 299 300 /* 301 * Outputs the set of physical memory segments. 302 */ 303 static int 304 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) 305 { 306 struct sbuf sbuf; 307 struct vm_phys_seg *seg; 308 int error, segind; 309 310 error = sysctl_wire_old_buffer(req, 0); 311 if (error != 0) 312 return (error); 313 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 314 for (segind = 0; segind < vm_phys_nsegs; segind++) { 315 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); 316 seg = &vm_phys_segs[segind]; 317 sbuf_printf(&sbuf, "start: %#jx\n", 318 (uintmax_t)seg->start); 319 sbuf_printf(&sbuf, "end: %#jx\n", 320 (uintmax_t)seg->end); 321 sbuf_printf(&sbuf, "domain: %d\n", seg->domain); 322 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); 323 } 324 error = sbuf_finish(&sbuf); 325 sbuf_delete(&sbuf); 326 return (error); 327 } 328 329 /* 330 * Return affinity, or -1 if there's no affinity information. 331 */ 332 int 333 vm_phys_mem_affinity(int f, int t) 334 { 335 336 #ifdef NUMA 337 if (mem_locality == NULL) 338 return (-1); 339 if (f >= vm_ndomains || t >= vm_ndomains) 340 return (-1); 341 return (mem_locality[f * vm_ndomains + t]); 342 #else 343 return (-1); 344 #endif 345 } 346 347 #ifdef NUMA 348 /* 349 * Outputs the VM locality table. 350 */ 351 static int 352 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) 353 { 354 struct sbuf sbuf; 355 int error, i, j; 356 357 error = sysctl_wire_old_buffer(req, 0); 358 if (error != 0) 359 return (error); 360 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 361 362 sbuf_printf(&sbuf, "\n"); 363 364 for (i = 0; i < vm_ndomains; i++) { 365 sbuf_printf(&sbuf, "%d: ", i); 366 for (j = 0; j < vm_ndomains; j++) { 367 sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); 368 } 369 sbuf_printf(&sbuf, "\n"); 370 } 371 error = sbuf_finish(&sbuf); 372 sbuf_delete(&sbuf); 373 return (error); 374 } 375 #endif 376 377 static void 378 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) 379 { 380 381 m->order = order; 382 if (tail) 383 TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); 384 else 385 TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); 386 fl[order].lcnt++; 387 } 388 389 static void 390 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) 391 { 392 393 TAILQ_REMOVE(&fl[order].pl, m, listq); 394 fl[order].lcnt--; 395 m->order = VM_NFREEORDER; 396 } 397 398 /* 399 * Create a physical memory segment. 400 */ 401 static void 402 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) 403 { 404 struct vm_phys_seg *seg; 405 406 KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, 407 ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); 408 KASSERT(domain >= 0 && domain < vm_ndomains, 409 ("vm_phys_create_seg: invalid domain provided")); 410 seg = &vm_phys_segs[vm_phys_nsegs++]; 411 while (seg > vm_phys_segs && (seg - 1)->start >= end) { 412 *seg = *(seg - 1); 413 seg--; 414 } 415 seg->start = start; 416 seg->end = end; 417 seg->domain = domain; 418 } 419 420 static void 421 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) 422 { 423 #ifdef NUMA 424 int i; 425 426 if (mem_affinity == NULL) { 427 _vm_phys_create_seg(start, end, 0); 428 return; 429 } 430 431 for (i = 0;; i++) { 432 if (mem_affinity[i].end == 0) 433 panic("Reached end of affinity info"); 434 if (mem_affinity[i].end <= start) 435 continue; 436 if (mem_affinity[i].start > start) 437 panic("No affinity info for start %jx", 438 (uintmax_t)start); 439 if (mem_affinity[i].end >= end) { 440 _vm_phys_create_seg(start, end, 441 mem_affinity[i].domain); 442 break; 443 } 444 _vm_phys_create_seg(start, mem_affinity[i].end, 445 mem_affinity[i].domain); 446 start = mem_affinity[i].end; 447 } 448 #else 449 _vm_phys_create_seg(start, end, 0); 450 #endif 451 } 452 453 /* 454 * Add a physical memory segment. 455 */ 456 void 457 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) 458 { 459 vm_paddr_t paddr; 460 461 KASSERT((start & PAGE_MASK) == 0, 462 ("vm_phys_define_seg: start is not page aligned")); 463 KASSERT((end & PAGE_MASK) == 0, 464 ("vm_phys_define_seg: end is not page aligned")); 465 466 /* 467 * Split the physical memory segment if it spans two or more free 468 * list boundaries. 469 */ 470 paddr = start; 471 #ifdef VM_FREELIST_LOWMEM 472 if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { 473 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); 474 paddr = VM_LOWMEM_BOUNDARY; 475 } 476 #endif 477 #ifdef VM_FREELIST_DMA32 478 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { 479 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); 480 paddr = VM_DMA32_BOUNDARY; 481 } 482 #endif 483 vm_phys_create_seg(paddr, end); 484 } 485 486 /* 487 * Initialize the physical memory allocator. 488 * 489 * Requires that vm_page_array is initialized! 490 */ 491 void 492 vm_phys_init(void) 493 { 494 struct vm_freelist *fl; 495 struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg; 496 #if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE) 497 u_long npages; 498 #endif 499 int dom, flind, freelist, oind, pind, segind; 500 501 /* 502 * Compute the number of free lists, and generate the mapping from the 503 * manifest constants VM_FREELIST_* to the free list indices. 504 * 505 * Initially, the entries of vm_freelist_to_flind[] are set to either 506 * 0 or 1 to indicate which free lists should be created. 507 */ 508 #ifdef VM_DMA32_NPAGES_THRESHOLD 509 npages = 0; 510 #endif 511 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 512 seg = &vm_phys_segs[segind]; 513 #ifdef VM_FREELIST_LOWMEM 514 if (seg->end <= VM_LOWMEM_BOUNDARY) 515 vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; 516 else 517 #endif 518 #ifdef VM_FREELIST_DMA32 519 if ( 520 #ifdef VM_DMA32_NPAGES_THRESHOLD 521 /* 522 * Create the DMA32 free list only if the amount of 523 * physical memory above physical address 4G exceeds the 524 * given threshold. 525 */ 526 npages > VM_DMA32_NPAGES_THRESHOLD && 527 #endif 528 seg->end <= VM_DMA32_BOUNDARY) 529 vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; 530 else 531 #endif 532 { 533 #ifdef VM_DMA32_NPAGES_THRESHOLD 534 npages += atop(seg->end - seg->start); 535 #endif 536 vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; 537 } 538 } 539 /* Change each entry into a running total of the free lists. */ 540 for (freelist = 1; freelist < VM_NFREELIST; freelist++) { 541 vm_freelist_to_flind[freelist] += 542 vm_freelist_to_flind[freelist - 1]; 543 } 544 vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; 545 KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); 546 /* Change each entry into a free list index. */ 547 for (freelist = 0; freelist < VM_NFREELIST; freelist++) 548 vm_freelist_to_flind[freelist]--; 549 550 /* 551 * Initialize the first_page and free_queues fields of each physical 552 * memory segment. 553 */ 554 #ifdef VM_PHYSSEG_SPARSE 555 npages = 0; 556 #endif 557 for (segind = 0; segind < vm_phys_nsegs; segind++) { 558 seg = &vm_phys_segs[segind]; 559 #ifdef VM_PHYSSEG_SPARSE 560 seg->first_page = &vm_page_array[npages]; 561 npages += atop(seg->end - seg->start); 562 #else 563 seg->first_page = PHYS_TO_VM_PAGE(seg->start); 564 #endif 565 #ifdef VM_FREELIST_LOWMEM 566 if (seg->end <= VM_LOWMEM_BOUNDARY) { 567 flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; 568 KASSERT(flind >= 0, 569 ("vm_phys_init: LOWMEM flind < 0")); 570 } else 571 #endif 572 #ifdef VM_FREELIST_DMA32 573 if (seg->end <= VM_DMA32_BOUNDARY) { 574 flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; 575 KASSERT(flind >= 0, 576 ("vm_phys_init: DMA32 flind < 0")); 577 } else 578 #endif 579 { 580 flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; 581 KASSERT(flind >= 0, 582 ("vm_phys_init: DEFAULT flind < 0")); 583 } 584 seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; 585 } 586 587 /* 588 * Coalesce physical memory segments that are contiguous and share the 589 * same per-domain free queues. 590 */ 591 prev_seg = vm_phys_segs; 592 seg = &vm_phys_segs[1]; 593 end_seg = &vm_phys_segs[vm_phys_nsegs]; 594 while (seg < end_seg) { 595 if (prev_seg->end == seg->start && 596 prev_seg->free_queues == seg->free_queues) { 597 prev_seg->end = seg->end; 598 KASSERT(prev_seg->domain == seg->domain, 599 ("vm_phys_init: free queues cannot span domains")); 600 vm_phys_nsegs--; 601 end_seg--; 602 for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++) 603 *tmp_seg = *(tmp_seg + 1); 604 } else { 605 prev_seg = seg; 606 seg++; 607 } 608 } 609 610 /* 611 * Initialize the free queues. 612 */ 613 for (dom = 0; dom < vm_ndomains; dom++) { 614 for (flind = 0; flind < vm_nfreelists; flind++) { 615 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 616 fl = vm_phys_free_queues[dom][flind][pind]; 617 for (oind = 0; oind < VM_NFREEORDER; oind++) 618 TAILQ_INIT(&fl[oind].pl); 619 } 620 } 621 } 622 623 rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); 624 } 625 626 /* 627 * Register info about the NUMA topology of the system. 628 * 629 * Invoked by platform-dependent code prior to vm_phys_init(). 630 */ 631 void 632 vm_phys_register_domains(int ndomains, struct mem_affinity *affinity, 633 int *locality) 634 { 635 #ifdef NUMA 636 int i; 637 638 /* 639 * For now the only override value that we support is 1, which 640 * effectively disables NUMA-awareness in the allocators. 641 */ 642 TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled); 643 if (numa_disabled) 644 ndomains = 1; 645 646 if (ndomains > 1) { 647 vm_ndomains = ndomains; 648 mem_affinity = affinity; 649 mem_locality = locality; 650 } 651 652 for (i = 0; i < vm_ndomains; i++) 653 DOMAINSET_SET(i, &all_domains); 654 #else 655 (void)ndomains; 656 (void)affinity; 657 (void)locality; 658 #endif 659 } 660 661 /* 662 * Split a contiguous, power of two-sized set of physical pages. 663 * 664 * When this function is called by a page allocation function, the caller 665 * should request insertion at the head unless the order [order, oind) queues 666 * are known to be empty. The objective being to reduce the likelihood of 667 * long-term fragmentation by promoting contemporaneous allocation and 668 * (hopefully) deallocation. 669 */ 670 static __inline void 671 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, 672 int tail) 673 { 674 vm_page_t m_buddy; 675 676 while (oind > order) { 677 oind--; 678 m_buddy = &m[1 << oind]; 679 KASSERT(m_buddy->order == VM_NFREEORDER, 680 ("vm_phys_split_pages: page %p has unexpected order %d", 681 m_buddy, m_buddy->order)); 682 vm_freelist_add(fl, m_buddy, oind, tail); 683 } 684 } 685 686 /* 687 * Add the physical pages [m, m + npages) at the beginning of a power-of-two 688 * aligned and sized set to the specified free list. 689 * 690 * When this function is called by a page allocation function, the caller 691 * should request insertion at the head unless the lower-order queues are 692 * known to be empty. The objective being to reduce the likelihood of long- 693 * term fragmentation by promoting contemporaneous allocation and (hopefully) 694 * deallocation. 695 * 696 * The physical page m's buddy must not be free. 697 */ 698 static void 699 vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) 700 { 701 int order; 702 703 KASSERT(npages == 0 || 704 (VM_PAGE_TO_PHYS(m) & 705 ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0, 706 ("%s: page %p and npages %u are misaligned", 707 __func__, m, npages)); 708 while (npages > 0) { 709 KASSERT(m->order == VM_NFREEORDER, 710 ("%s: page %p has unexpected order %d", 711 __func__, m, m->order)); 712 order = fls(npages) - 1; 713 KASSERT(order < VM_NFREEORDER, 714 ("%s: order %d is out of range", __func__, order)); 715 vm_freelist_add(fl, m, order, tail); 716 m += 1 << order; 717 npages -= 1 << order; 718 } 719 } 720 721 /* 722 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned 723 * and sized set to the specified free list. 724 * 725 * When this function is called by a page allocation function, the caller 726 * should request insertion at the head unless the lower-order queues are 727 * known to be empty. The objective being to reduce the likelihood of long- 728 * term fragmentation by promoting contemporaneous allocation and (hopefully) 729 * deallocation. 730 * 731 * If npages is zero, this function does nothing and ignores the physical page 732 * parameter m. Otherwise, the physical page m's buddy must not be free. 733 */ 734 static vm_page_t 735 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) 736 { 737 int order; 738 739 KASSERT(npages == 0 || 740 ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) & 741 ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0, 742 ("vm_phys_enq_range: page %p and npages %u are misaligned", 743 m, npages)); 744 while (npages > 0) { 745 KASSERT(m->order == VM_NFREEORDER, 746 ("vm_phys_enq_range: page %p has unexpected order %d", 747 m, m->order)); 748 order = ffs(npages) - 1; 749 KASSERT(order < VM_NFREEORDER, 750 ("vm_phys_enq_range: order %d is out of range", order)); 751 vm_freelist_add(fl, m, order, tail); 752 m += 1 << order; 753 npages -= 1 << order; 754 } 755 return (m); 756 } 757 758 /* 759 * Set the pool for a contiguous, power of two-sized set of physical pages. 760 */ 761 static void 762 vm_phys_set_pool(int pool, vm_page_t m, int order) 763 { 764 vm_page_t m_tmp; 765 766 for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) 767 m_tmp->pool = pool; 768 } 769 770 /* 771 * Tries to allocate the specified number of pages from the specified pool 772 * within the specified domain. Returns the actual number of allocated pages 773 * and a pointer to each page through the array ma[]. 774 * 775 * The returned pages may not be physically contiguous. However, in contrast 776 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0), 777 * calling this function once to allocate the desired number of pages will 778 * avoid wasted time in vm_phys_split_pages(). 779 * 780 * The free page queues for the specified domain must be locked. 781 */ 782 int 783 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) 784 { 785 struct vm_freelist *alt, *fl; 786 vm_page_t m; 787 int avail, end, flind, freelist, i, oind, pind; 788 789 KASSERT(domain >= 0 && domain < vm_ndomains, 790 ("vm_phys_alloc_npages: domain %d is out of range", domain)); 791 KASSERT(pool < VM_NFREEPOOL, 792 ("vm_phys_alloc_npages: pool %d is out of range", pool)); 793 KASSERT(npages <= 1 << (VM_NFREEORDER - 1), 794 ("vm_phys_alloc_npages: npages %d is out of range", npages)); 795 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 796 i = 0; 797 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 798 flind = vm_freelist_to_flind[freelist]; 799 if (flind < 0) 800 continue; 801 fl = vm_phys_free_queues[domain][flind][pool]; 802 for (oind = 0; oind < VM_NFREEORDER; oind++) { 803 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 804 vm_freelist_rem(fl, m, oind); 805 avail = i + (1 << oind); 806 end = imin(npages, avail); 807 while (i < end) 808 ma[i++] = m++; 809 if (i == npages) { 810 /* 811 * Return excess pages to fl. Its order 812 * [0, oind) queues are empty. 813 */ 814 vm_phys_enq_range(m, avail - i, fl, 1); 815 return (npages); 816 } 817 } 818 } 819 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 820 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 821 alt = vm_phys_free_queues[domain][flind][pind]; 822 while ((m = TAILQ_FIRST(&alt[oind].pl)) != 823 NULL) { 824 vm_freelist_rem(alt, m, oind); 825 vm_phys_set_pool(pool, m, oind); 826 avail = i + (1 << oind); 827 end = imin(npages, avail); 828 while (i < end) 829 ma[i++] = m++; 830 if (i == npages) { 831 /* 832 * Return excess pages to fl. 833 * Its order [0, oind) queues 834 * are empty. 835 */ 836 vm_phys_enq_range(m, avail - i, 837 fl, 1); 838 return (npages); 839 } 840 } 841 } 842 } 843 } 844 return (i); 845 } 846 847 /* 848 * Allocate a contiguous, power of two-sized set of physical pages 849 * from the free lists. 850 * 851 * The free page queues must be locked. 852 */ 853 vm_page_t 854 vm_phys_alloc_pages(int domain, int pool, int order) 855 { 856 vm_page_t m; 857 int freelist; 858 859 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 860 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); 861 if (m != NULL) 862 return (m); 863 } 864 return (NULL); 865 } 866 867 /* 868 * Allocate a contiguous, power of two-sized set of physical pages from the 869 * specified free list. The free list must be specified using one of the 870 * manifest constants VM_FREELIST_*. 871 * 872 * The free page queues must be locked. 873 */ 874 vm_page_t 875 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) 876 { 877 struct vm_freelist *alt, *fl; 878 vm_page_t m; 879 int oind, pind, flind; 880 881 KASSERT(domain >= 0 && domain < vm_ndomains, 882 ("vm_phys_alloc_freelist_pages: domain %d is out of range", 883 domain)); 884 KASSERT(freelist < VM_NFREELIST, 885 ("vm_phys_alloc_freelist_pages: freelist %d is out of range", 886 freelist)); 887 KASSERT(pool < VM_NFREEPOOL, 888 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); 889 KASSERT(order < VM_NFREEORDER, 890 ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); 891 892 flind = vm_freelist_to_flind[freelist]; 893 /* Check if freelist is present */ 894 if (flind < 0) 895 return (NULL); 896 897 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 898 fl = &vm_phys_free_queues[domain][flind][pool][0]; 899 for (oind = order; oind < VM_NFREEORDER; oind++) { 900 m = TAILQ_FIRST(&fl[oind].pl); 901 if (m != NULL) { 902 vm_freelist_rem(fl, m, oind); 903 /* The order [order, oind) queues are empty. */ 904 vm_phys_split_pages(m, oind, fl, order, 1); 905 return (m); 906 } 907 } 908 909 /* 910 * The given pool was empty. Find the largest 911 * contiguous, power-of-two-sized set of pages in any 912 * pool. Transfer these pages to the given pool, and 913 * use them to satisfy the allocation. 914 */ 915 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { 916 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 917 alt = &vm_phys_free_queues[domain][flind][pind][0]; 918 m = TAILQ_FIRST(&alt[oind].pl); 919 if (m != NULL) { 920 vm_freelist_rem(alt, m, oind); 921 vm_phys_set_pool(pool, m, oind); 922 /* The order [order, oind) queues are empty. */ 923 vm_phys_split_pages(m, oind, fl, order, 1); 924 return (m); 925 } 926 } 927 } 928 return (NULL); 929 } 930 931 /* 932 * Find the vm_page corresponding to the given physical address. 933 */ 934 vm_page_t 935 vm_phys_paddr_to_vm_page(vm_paddr_t pa) 936 { 937 struct vm_phys_seg *seg; 938 939 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) 940 return (&seg->first_page[atop(pa - seg->start)]); 941 return (NULL); 942 } 943 944 vm_page_t 945 vm_phys_fictitious_to_vm_page(vm_paddr_t pa) 946 { 947 struct vm_phys_fictitious_seg tmp, *seg; 948 vm_page_t m; 949 950 m = NULL; 951 tmp.start = pa; 952 tmp.end = 0; 953 954 rw_rlock(&vm_phys_fictitious_reg_lock); 955 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 956 rw_runlock(&vm_phys_fictitious_reg_lock); 957 if (seg == NULL) 958 return (NULL); 959 960 m = &seg->first_page[atop(pa - seg->start)]; 961 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); 962 963 return (m); 964 } 965 966 static inline void 967 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, 968 long page_count, vm_memattr_t memattr) 969 { 970 long i; 971 972 bzero(range, page_count * sizeof(*range)); 973 for (i = 0; i < page_count; i++) { 974 vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); 975 range[i].oflags &= ~VPO_UNMANAGED; 976 range[i].busy_lock = VPB_UNBUSIED; 977 } 978 } 979 980 int 981 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, 982 vm_memattr_t memattr) 983 { 984 struct vm_phys_fictitious_seg *seg; 985 vm_page_t fp; 986 long page_count; 987 #ifdef VM_PHYSSEG_DENSE 988 long pi, pe; 989 long dpage_count; 990 #endif 991 992 KASSERT(start < end, 993 ("Start of segment isn't less than end (start: %jx end: %jx)", 994 (uintmax_t)start, (uintmax_t)end)); 995 996 page_count = (end - start) / PAGE_SIZE; 997 998 #ifdef VM_PHYSSEG_DENSE 999 pi = atop(start); 1000 pe = atop(end); 1001 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1002 fp = &vm_page_array[pi - first_page]; 1003 if ((pe - first_page) > vm_page_array_size) { 1004 /* 1005 * We have a segment that starts inside 1006 * of vm_page_array, but ends outside of it. 1007 * 1008 * Use vm_page_array pages for those that are 1009 * inside of the vm_page_array range, and 1010 * allocate the remaining ones. 1011 */ 1012 dpage_count = vm_page_array_size - (pi - first_page); 1013 vm_phys_fictitious_init_range(fp, start, dpage_count, 1014 memattr); 1015 page_count -= dpage_count; 1016 start += ptoa(dpage_count); 1017 goto alloc; 1018 } 1019 /* 1020 * We can allocate the full range from vm_page_array, 1021 * so there's no need to register the range in the tree. 1022 */ 1023 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1024 return (0); 1025 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1026 /* 1027 * We have a segment that ends inside of vm_page_array, 1028 * but starts outside of it. 1029 */ 1030 fp = &vm_page_array[0]; 1031 dpage_count = pe - first_page; 1032 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, 1033 memattr); 1034 end -= ptoa(dpage_count); 1035 page_count -= dpage_count; 1036 goto alloc; 1037 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1038 /* 1039 * Trying to register a fictitious range that expands before 1040 * and after vm_page_array. 1041 */ 1042 return (EINVAL); 1043 } else { 1044 alloc: 1045 #endif 1046 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, 1047 M_WAITOK); 1048 #ifdef VM_PHYSSEG_DENSE 1049 } 1050 #endif 1051 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1052 1053 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); 1054 seg->start = start; 1055 seg->end = end; 1056 seg->first_page = fp; 1057 1058 rw_wlock(&vm_phys_fictitious_reg_lock); 1059 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); 1060 rw_wunlock(&vm_phys_fictitious_reg_lock); 1061 1062 return (0); 1063 } 1064 1065 void 1066 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) 1067 { 1068 struct vm_phys_fictitious_seg *seg, tmp; 1069 #ifdef VM_PHYSSEG_DENSE 1070 long pi, pe; 1071 #endif 1072 1073 KASSERT(start < end, 1074 ("Start of segment isn't less than end (start: %jx end: %jx)", 1075 (uintmax_t)start, (uintmax_t)end)); 1076 1077 #ifdef VM_PHYSSEG_DENSE 1078 pi = atop(start); 1079 pe = atop(end); 1080 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1081 if ((pe - first_page) <= vm_page_array_size) { 1082 /* 1083 * This segment was allocated using vm_page_array 1084 * only, there's nothing to do since those pages 1085 * were never added to the tree. 1086 */ 1087 return; 1088 } 1089 /* 1090 * We have a segment that starts inside 1091 * of vm_page_array, but ends outside of it. 1092 * 1093 * Calculate how many pages were added to the 1094 * tree and free them. 1095 */ 1096 start = ptoa(first_page + vm_page_array_size); 1097 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1098 /* 1099 * We have a segment that ends inside of vm_page_array, 1100 * but starts outside of it. 1101 */ 1102 end = ptoa(first_page); 1103 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1104 /* Since it's not possible to register such a range, panic. */ 1105 panic( 1106 "Unregistering not registered fictitious range [%#jx:%#jx]", 1107 (uintmax_t)start, (uintmax_t)end); 1108 } 1109 #endif 1110 tmp.start = start; 1111 tmp.end = 0; 1112 1113 rw_wlock(&vm_phys_fictitious_reg_lock); 1114 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1115 if (seg->start != start || seg->end != end) { 1116 rw_wunlock(&vm_phys_fictitious_reg_lock); 1117 panic( 1118 "Unregistering not registered fictitious range [%#jx:%#jx]", 1119 (uintmax_t)start, (uintmax_t)end); 1120 } 1121 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); 1122 rw_wunlock(&vm_phys_fictitious_reg_lock); 1123 free(seg->first_page, M_FICT_PAGES); 1124 free(seg, M_FICT_PAGES); 1125 } 1126 1127 /* 1128 * Free a contiguous, power of two-sized set of physical pages. 1129 * 1130 * The free page queues must be locked. 1131 */ 1132 void 1133 vm_phys_free_pages(vm_page_t m, int order) 1134 { 1135 struct vm_freelist *fl; 1136 struct vm_phys_seg *seg; 1137 vm_paddr_t pa; 1138 vm_page_t m_buddy; 1139 1140 KASSERT(m->order == VM_NFREEORDER, 1141 ("vm_phys_free_pages: page %p has unexpected order %d", 1142 m, m->order)); 1143 KASSERT(m->pool < VM_NFREEPOOL, 1144 ("vm_phys_free_pages: page %p has unexpected pool %d", 1145 m, m->pool)); 1146 KASSERT(order < VM_NFREEORDER, 1147 ("vm_phys_free_pages: order %d is out of range", order)); 1148 seg = &vm_phys_segs[m->segind]; 1149 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1150 if (order < VM_NFREEORDER - 1) { 1151 pa = VM_PAGE_TO_PHYS(m); 1152 do { 1153 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); 1154 if (pa < seg->start || pa >= seg->end) 1155 break; 1156 m_buddy = &seg->first_page[atop(pa - seg->start)]; 1157 if (m_buddy->order != order) 1158 break; 1159 fl = (*seg->free_queues)[m_buddy->pool]; 1160 vm_freelist_rem(fl, m_buddy, order); 1161 if (m_buddy->pool != m->pool) 1162 vm_phys_set_pool(m->pool, m_buddy, order); 1163 order++; 1164 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); 1165 m = &seg->first_page[atop(pa - seg->start)]; 1166 } while (order < VM_NFREEORDER - 1); 1167 } 1168 fl = (*seg->free_queues)[m->pool]; 1169 vm_freelist_add(fl, m, order, 1); 1170 } 1171 1172 /* 1173 * Return the largest possible order of a set of pages starting at m. 1174 */ 1175 static int 1176 max_order(vm_page_t m) 1177 { 1178 1179 /* 1180 * Unsigned "min" is used here so that "order" is assigned 1181 * "VM_NFREEORDER - 1" when "m"'s physical address is zero 1182 * or the low-order bits of its physical address are zero 1183 * because the size of a physical address exceeds the size of 1184 * a long. 1185 */ 1186 return (min(ffsll(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, 1187 VM_NFREEORDER - 1)); 1188 } 1189 1190 /* 1191 * Free a contiguous, arbitrarily sized set of physical pages, without 1192 * merging across set boundaries. 1193 * 1194 * The free page queues must be locked. 1195 */ 1196 void 1197 vm_phys_enqueue_contig(vm_page_t m, u_long npages) 1198 { 1199 struct vm_freelist *fl; 1200 struct vm_phys_seg *seg; 1201 vm_page_t m_end; 1202 vm_paddr_t diff, lo; 1203 int order; 1204 1205 /* 1206 * Avoid unnecessary coalescing by freeing the pages in the largest 1207 * possible power-of-two-sized subsets. 1208 */ 1209 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1210 seg = &vm_phys_segs[m->segind]; 1211 fl = (*seg->free_queues)[m->pool]; 1212 m_end = m + npages; 1213 /* Free blocks of increasing size. */ 1214 lo = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT; 1215 if (m < m_end && 1216 (diff = lo ^ (lo + npages - 1)) != 0) { 1217 order = min(flsll(diff) - 1, VM_NFREEORDER - 1); 1218 m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1); 1219 } 1220 1221 /* Free blocks of maximum size. */ 1222 order = VM_NFREEORDER - 1; 1223 while (m + (1 << order) <= m_end) { 1224 KASSERT(seg == &vm_phys_segs[m->segind], 1225 ("%s: page range [%p,%p) spans multiple segments", 1226 __func__, m_end - npages, m)); 1227 vm_freelist_add(fl, m, order, 1); 1228 m += 1 << order; 1229 } 1230 /* Free blocks of diminishing size. */ 1231 vm_phys_enq_beg(m, m_end - m, fl, 1); 1232 } 1233 1234 /* 1235 * Free a contiguous, arbitrarily sized set of physical pages. 1236 * 1237 * The free page queues must be locked. 1238 */ 1239 void 1240 vm_phys_free_contig(vm_page_t m, u_long npages) 1241 { 1242 int order_start, order_end; 1243 vm_page_t m_start, m_end; 1244 1245 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1246 1247 m_start = m; 1248 order_start = max_order(m_start); 1249 if (order_start < VM_NFREEORDER - 1) 1250 m_start += 1 << order_start; 1251 m_end = m + npages; 1252 order_end = max_order(m_end); 1253 if (order_end < VM_NFREEORDER - 1) 1254 m_end -= 1 << order_end; 1255 /* 1256 * Avoid unnecessary coalescing by freeing the pages at the start and 1257 * end of the range last. 1258 */ 1259 if (m_start < m_end) 1260 vm_phys_enqueue_contig(m_start, m_end - m_start); 1261 if (order_start < VM_NFREEORDER - 1) 1262 vm_phys_free_pages(m, order_start); 1263 if (order_end < VM_NFREEORDER - 1) 1264 vm_phys_free_pages(m_end, order_end); 1265 } 1266 1267 /* 1268 * Identify the first address range within segment segind or greater 1269 * that matches the domain, lies within the low/high range, and has 1270 * enough pages. Return -1 if there is none. 1271 */ 1272 int 1273 vm_phys_find_range(vm_page_t bounds[], int segind, int domain, 1274 u_long npages, vm_paddr_t low, vm_paddr_t high) 1275 { 1276 vm_paddr_t pa_end, pa_start; 1277 struct vm_phys_seg *end_seg, *seg; 1278 1279 KASSERT(npages > 0, ("npages is zero")); 1280 KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range")); 1281 end_seg = &vm_phys_segs[vm_phys_nsegs]; 1282 for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) { 1283 if (seg->domain != domain) 1284 continue; 1285 if (seg->start >= high) 1286 return (-1); 1287 pa_start = MAX(low, seg->start); 1288 pa_end = MIN(high, seg->end); 1289 if (pa_end - pa_start < ptoa(npages)) 1290 continue; 1291 bounds[0] = &seg->first_page[atop(pa_start - seg->start)]; 1292 bounds[1] = &seg->first_page[atop(pa_end - seg->start)]; 1293 return (seg - vm_phys_segs); 1294 } 1295 return (-1); 1296 } 1297 1298 /* 1299 * Search for the given physical page "m" in the free lists. If the search 1300 * succeeds, remove "m" from the free lists and return true. Otherwise, return 1301 * false, indicating that "m" is not in the free lists. 1302 * 1303 * The free page queues must be locked. 1304 */ 1305 bool 1306 vm_phys_unfree_page(vm_page_t m) 1307 { 1308 struct vm_freelist *fl; 1309 struct vm_phys_seg *seg; 1310 vm_paddr_t pa, pa_half; 1311 vm_page_t m_set, m_tmp; 1312 int order; 1313 1314 /* 1315 * First, find the contiguous, power of two-sized set of free 1316 * physical pages containing the given physical page "m" and 1317 * assign it to "m_set". 1318 */ 1319 seg = &vm_phys_segs[m->segind]; 1320 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1321 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && 1322 order < VM_NFREEORDER - 1; ) { 1323 order++; 1324 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); 1325 if (pa >= seg->start) 1326 m_set = &seg->first_page[atop(pa - seg->start)]; 1327 else 1328 return (false); 1329 } 1330 if (m_set->order < order) 1331 return (false); 1332 if (m_set->order == VM_NFREEORDER) 1333 return (false); 1334 KASSERT(m_set->order < VM_NFREEORDER, 1335 ("vm_phys_unfree_page: page %p has unexpected order %d", 1336 m_set, m_set->order)); 1337 1338 /* 1339 * Next, remove "m_set" from the free lists. Finally, extract 1340 * "m" from "m_set" using an iterative algorithm: While "m_set" 1341 * is larger than a page, shrink "m_set" by returning the half 1342 * of "m_set" that does not contain "m" to the free lists. 1343 */ 1344 fl = (*seg->free_queues)[m_set->pool]; 1345 order = m_set->order; 1346 vm_freelist_rem(fl, m_set, order); 1347 while (order > 0) { 1348 order--; 1349 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); 1350 if (m->phys_addr < pa_half) 1351 m_tmp = &seg->first_page[atop(pa_half - seg->start)]; 1352 else { 1353 m_tmp = m_set; 1354 m_set = &seg->first_page[atop(pa_half - seg->start)]; 1355 } 1356 vm_freelist_add(fl, m_tmp, order, 0); 1357 } 1358 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); 1359 return (true); 1360 } 1361 1362 /* 1363 * Find a run of contiguous physical pages from the specified page list. 1364 */ 1365 static vm_page_t 1366 vm_phys_find_freelist_contig(struct vm_freelist *fl, int oind, u_long npages, 1367 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1368 { 1369 struct vm_phys_seg *seg; 1370 vm_paddr_t frag, lbound, pa, page_size, pa_end, pa_pre, size; 1371 vm_page_t m, m_listed, m_ret; 1372 int order; 1373 1374 KASSERT(npages > 0, ("npages is 0")); 1375 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1376 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1377 /* Search for a run satisfying the specified conditions. */ 1378 page_size = PAGE_SIZE; 1379 size = npages << PAGE_SHIFT; 1380 frag = (npages & ~(~0UL << oind)) << PAGE_SHIFT; 1381 TAILQ_FOREACH(m_listed, &fl[oind].pl, listq) { 1382 /* 1383 * Determine if the address range starting at pa is 1384 * too low. 1385 */ 1386 pa = VM_PAGE_TO_PHYS(m_listed); 1387 if (pa < low) 1388 continue; 1389 1390 /* 1391 * If this is not the first free oind-block in this range, bail 1392 * out. We have seen the first free block already, or will see 1393 * it before failing to find an appropriate range. 1394 */ 1395 seg = &vm_phys_segs[m_listed->segind]; 1396 lbound = low > seg->start ? low : seg->start; 1397 pa_pre = pa - (page_size << oind); 1398 m = &seg->first_page[atop(pa_pre - seg->start)]; 1399 if (pa != 0 && pa_pre >= lbound && m->order == oind) 1400 continue; 1401 1402 if (!vm_addr_align_ok(pa, alignment)) 1403 /* Advance to satisfy alignment condition. */ 1404 pa = roundup2(pa, alignment); 1405 else if (frag != 0 && lbound + frag <= pa) { 1406 /* 1407 * Back up to the first aligned free block in this 1408 * range, without moving below lbound. 1409 */ 1410 pa_end = pa; 1411 for (order = oind - 1; order >= 0; order--) { 1412 pa_pre = pa_end - (page_size << order); 1413 if (!vm_addr_align_ok(pa_pre, alignment)) 1414 break; 1415 m = &seg->first_page[atop(pa_pre - seg->start)]; 1416 if (pa_pre >= lbound && m->order == order) 1417 pa_end = pa_pre; 1418 } 1419 /* 1420 * If the extra small blocks are enough to complete the 1421 * fragment, use them. Otherwise, look to allocate the 1422 * fragment at the other end. 1423 */ 1424 if (pa_end + frag <= pa) 1425 pa = pa_end; 1426 } 1427 1428 /* Advance as necessary to satisfy boundary conditions. */ 1429 if (!vm_addr_bound_ok(pa, size, boundary)) 1430 pa = roundup2(pa + 1, boundary); 1431 pa_end = pa + size; 1432 1433 /* 1434 * Determine if the address range is valid (without overflow in 1435 * pa_end calculation), and fits within the segment. 1436 */ 1437 if (pa_end < pa || seg->end < pa_end) 1438 continue; 1439 1440 m_ret = &seg->first_page[atop(pa - seg->start)]; 1441 1442 /* 1443 * Determine whether there are enough free oind-blocks here to 1444 * satisfy the allocation request. 1445 */ 1446 pa = VM_PAGE_TO_PHYS(m_listed); 1447 do { 1448 pa += page_size << oind; 1449 if (pa >= pa_end) 1450 return (m_ret); 1451 m = &seg->first_page[atop(pa - seg->start)]; 1452 } while (oind == m->order); 1453 1454 /* 1455 * Determine if an additional series of free blocks of 1456 * diminishing size can help to satisfy the allocation request. 1457 */ 1458 while (m->order < oind && 1459 pa + 2 * (page_size << m->order) > pa_end) { 1460 pa += page_size << m->order; 1461 if (pa >= pa_end) 1462 return (m_ret); 1463 m = &seg->first_page[atop(pa - seg->start)]; 1464 } 1465 } 1466 return (NULL); 1467 } 1468 1469 /* 1470 * Find a run of contiguous physical pages from the specified free list 1471 * table. 1472 */ 1473 static vm_page_t 1474 vm_phys_find_queues_contig( 1475 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX], 1476 u_long npages, vm_paddr_t low, vm_paddr_t high, 1477 u_long alignment, vm_paddr_t boundary) 1478 { 1479 struct vm_freelist *fl; 1480 vm_page_t m_ret; 1481 vm_paddr_t pa, pa_end, size; 1482 int oind, order, pind; 1483 1484 KASSERT(npages > 0, ("npages is 0")); 1485 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1486 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1487 /* Compute the queue that is the best fit for npages. */ 1488 order = flsl(npages - 1); 1489 /* Search for a large enough free block. */ 1490 size = npages << PAGE_SHIFT; 1491 for (oind = order; oind < VM_NFREEORDER; oind++) { 1492 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1493 fl = (*queues)[pind]; 1494 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { 1495 /* 1496 * Determine if the address range starting at pa 1497 * is within the given range, satisfies the 1498 * given alignment, and does not cross the given 1499 * boundary. 1500 */ 1501 pa = VM_PAGE_TO_PHYS(m_ret); 1502 pa_end = pa + size; 1503 if (low <= pa && pa_end <= high && 1504 vm_addr_ok(pa, size, alignment, boundary)) 1505 return (m_ret); 1506 } 1507 } 1508 } 1509 if (order < VM_NFREEORDER) 1510 return (NULL); 1511 /* Search for a long-enough sequence of small blocks. */ 1512 oind = VM_NFREEORDER - 1; 1513 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1514 fl = (*queues)[pind]; 1515 m_ret = vm_phys_find_freelist_contig(fl, oind, npages, 1516 low, high, alignment, boundary); 1517 if (m_ret != NULL) 1518 return (m_ret); 1519 } 1520 return (NULL); 1521 } 1522 1523 /* 1524 * Allocate a contiguous set of physical pages of the given size 1525 * "npages" from the free lists. All of the physical pages must be at 1526 * or above the given physical address "low" and below the given 1527 * physical address "high". The given value "alignment" determines the 1528 * alignment of the first physical page in the set. If the given value 1529 * "boundary" is non-zero, then the set of physical pages cannot cross 1530 * any physical address boundary that is a multiple of that value. Both 1531 * "alignment" and "boundary" must be a power of two. 1532 */ 1533 vm_page_t 1534 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1535 u_long alignment, vm_paddr_t boundary) 1536 { 1537 vm_paddr_t pa_end, pa_start; 1538 struct vm_freelist *fl; 1539 vm_page_t m, m_run; 1540 struct vm_phys_seg *seg; 1541 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX]; 1542 int oind, segind; 1543 1544 KASSERT(npages > 0, ("npages is 0")); 1545 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1546 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1547 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 1548 if (low >= high) 1549 return (NULL); 1550 queues = NULL; 1551 m_run = NULL; 1552 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 1553 seg = &vm_phys_segs[segind]; 1554 if (seg->start >= high || seg->domain != domain) 1555 continue; 1556 if (low >= seg->end) 1557 break; 1558 if (low <= seg->start) 1559 pa_start = seg->start; 1560 else 1561 pa_start = low; 1562 if (high < seg->end) 1563 pa_end = high; 1564 else 1565 pa_end = seg->end; 1566 if (pa_end - pa_start < ptoa(npages)) 1567 continue; 1568 /* 1569 * If a previous segment led to a search using 1570 * the same free lists as would this segment, then 1571 * we've actually already searched within this 1572 * too. So skip it. 1573 */ 1574 if (seg->free_queues == queues) 1575 continue; 1576 queues = seg->free_queues; 1577 m_run = vm_phys_find_queues_contig(queues, npages, 1578 low, high, alignment, boundary); 1579 if (m_run != NULL) 1580 break; 1581 } 1582 if (m_run == NULL) 1583 return (NULL); 1584 1585 /* Allocate pages from the page-range found. */ 1586 for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) { 1587 fl = (*queues)[m->pool]; 1588 oind = m->order; 1589 vm_freelist_rem(fl, m, oind); 1590 if (m->pool != VM_FREEPOOL_DEFAULT) 1591 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind); 1592 } 1593 /* Return excess pages to the free lists. */ 1594 fl = (*queues)[VM_FREEPOOL_DEFAULT]; 1595 vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0); 1596 return (m_run); 1597 } 1598 1599 /* 1600 * Return the index of the first unused slot which may be the terminating 1601 * entry. 1602 */ 1603 static int 1604 vm_phys_avail_count(void) 1605 { 1606 int i; 1607 1608 for (i = 0; phys_avail[i + 1]; i += 2) 1609 continue; 1610 if (i > PHYS_AVAIL_ENTRIES) 1611 panic("Improperly terminated phys_avail %d entries", i); 1612 1613 return (i); 1614 } 1615 1616 /* 1617 * Assert that a phys_avail entry is valid. 1618 */ 1619 static void 1620 vm_phys_avail_check(int i) 1621 { 1622 if (phys_avail[i] & PAGE_MASK) 1623 panic("Unaligned phys_avail[%d]: %#jx", i, 1624 (intmax_t)phys_avail[i]); 1625 if (phys_avail[i+1] & PAGE_MASK) 1626 panic("Unaligned phys_avail[%d + 1]: %#jx", i, 1627 (intmax_t)phys_avail[i]); 1628 if (phys_avail[i + 1] < phys_avail[i]) 1629 panic("phys_avail[%d] start %#jx < end %#jx", i, 1630 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]); 1631 } 1632 1633 /* 1634 * Return the index of an overlapping phys_avail entry or -1. 1635 */ 1636 #ifdef NUMA 1637 static int 1638 vm_phys_avail_find(vm_paddr_t pa) 1639 { 1640 int i; 1641 1642 for (i = 0; phys_avail[i + 1]; i += 2) 1643 if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) 1644 return (i); 1645 return (-1); 1646 } 1647 #endif 1648 1649 /* 1650 * Return the index of the largest entry. 1651 */ 1652 int 1653 vm_phys_avail_largest(void) 1654 { 1655 vm_paddr_t sz, largesz; 1656 int largest; 1657 int i; 1658 1659 largest = 0; 1660 largesz = 0; 1661 for (i = 0; phys_avail[i + 1]; i += 2) { 1662 sz = vm_phys_avail_size(i); 1663 if (sz > largesz) { 1664 largesz = sz; 1665 largest = i; 1666 } 1667 } 1668 1669 return (largest); 1670 } 1671 1672 vm_paddr_t 1673 vm_phys_avail_size(int i) 1674 { 1675 1676 return (phys_avail[i + 1] - phys_avail[i]); 1677 } 1678 1679 /* 1680 * Split an entry at the address 'pa'. Return zero on success or errno. 1681 */ 1682 static int 1683 vm_phys_avail_split(vm_paddr_t pa, int i) 1684 { 1685 int cnt; 1686 1687 vm_phys_avail_check(i); 1688 if (pa <= phys_avail[i] || pa >= phys_avail[i + 1]) 1689 panic("vm_phys_avail_split: invalid address"); 1690 cnt = vm_phys_avail_count(); 1691 if (cnt >= PHYS_AVAIL_ENTRIES) 1692 return (ENOSPC); 1693 memmove(&phys_avail[i + 2], &phys_avail[i], 1694 (cnt - i) * sizeof(phys_avail[0])); 1695 phys_avail[i + 1] = pa; 1696 phys_avail[i + 2] = pa; 1697 vm_phys_avail_check(i); 1698 vm_phys_avail_check(i+2); 1699 1700 return (0); 1701 } 1702 1703 /* 1704 * Check if a given physical address can be included as part of a crash dump. 1705 */ 1706 bool 1707 vm_phys_is_dumpable(vm_paddr_t pa) 1708 { 1709 vm_page_t m; 1710 int i; 1711 1712 if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) 1713 return ((m->flags & PG_NODUMP) == 0); 1714 1715 for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { 1716 if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) 1717 return (true); 1718 } 1719 return (false); 1720 } 1721 1722 void 1723 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end) 1724 { 1725 struct vm_phys_seg *seg; 1726 1727 if (vm_phys_early_nsegs == -1) 1728 panic("%s: called after initialization", __func__); 1729 if (vm_phys_early_nsegs == nitems(vm_phys_early_segs)) 1730 panic("%s: ran out of early segments", __func__); 1731 1732 seg = &vm_phys_early_segs[vm_phys_early_nsegs++]; 1733 seg->start = start; 1734 seg->end = end; 1735 } 1736 1737 /* 1738 * This routine allocates NUMA node specific memory before the page 1739 * allocator is bootstrapped. 1740 */ 1741 vm_paddr_t 1742 vm_phys_early_alloc(int domain, size_t alloc_size) 1743 { 1744 #ifdef NUMA 1745 int mem_index; 1746 #endif 1747 int i, biggestone; 1748 vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align; 1749 1750 KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains), 1751 ("%s: invalid domain index %d", __func__, domain)); 1752 1753 /* 1754 * Search the mem_affinity array for the biggest address 1755 * range in the desired domain. This is used to constrain 1756 * the phys_avail selection below. 1757 */ 1758 biggestsize = 0; 1759 mem_start = 0; 1760 mem_end = -1; 1761 #ifdef NUMA 1762 mem_index = 0; 1763 if (mem_affinity != NULL) { 1764 for (i = 0;; i++) { 1765 size = mem_affinity[i].end - mem_affinity[i].start; 1766 if (size == 0) 1767 break; 1768 if (domain != -1 && mem_affinity[i].domain != domain) 1769 continue; 1770 if (size > biggestsize) { 1771 mem_index = i; 1772 biggestsize = size; 1773 } 1774 } 1775 mem_start = mem_affinity[mem_index].start; 1776 mem_end = mem_affinity[mem_index].end; 1777 } 1778 #endif 1779 1780 /* 1781 * Now find biggest physical segment in within the desired 1782 * numa domain. 1783 */ 1784 biggestsize = 0; 1785 biggestone = 0; 1786 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1787 /* skip regions that are out of range */ 1788 if (phys_avail[i+1] - alloc_size < mem_start || 1789 phys_avail[i+1] > mem_end) 1790 continue; 1791 size = vm_phys_avail_size(i); 1792 if (size > biggestsize) { 1793 biggestone = i; 1794 biggestsize = size; 1795 } 1796 } 1797 alloc_size = round_page(alloc_size); 1798 1799 /* 1800 * Grab single pages from the front to reduce fragmentation. 1801 */ 1802 if (alloc_size == PAGE_SIZE) { 1803 pa = phys_avail[biggestone]; 1804 phys_avail[biggestone] += PAGE_SIZE; 1805 vm_phys_avail_check(biggestone); 1806 return (pa); 1807 } 1808 1809 /* 1810 * Naturally align large allocations. 1811 */ 1812 align = phys_avail[biggestone + 1] & (alloc_size - 1); 1813 if (alloc_size + align > biggestsize) 1814 panic("cannot find a large enough size\n"); 1815 if (align != 0 && 1816 vm_phys_avail_split(phys_avail[biggestone + 1] - align, 1817 biggestone) != 0) 1818 /* Wasting memory. */ 1819 phys_avail[biggestone + 1] -= align; 1820 1821 phys_avail[biggestone + 1] -= alloc_size; 1822 vm_phys_avail_check(biggestone); 1823 pa = phys_avail[biggestone + 1]; 1824 return (pa); 1825 } 1826 1827 void 1828 vm_phys_early_startup(void) 1829 { 1830 struct vm_phys_seg *seg; 1831 int i; 1832 1833 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1834 phys_avail[i] = round_page(phys_avail[i]); 1835 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 1836 } 1837 1838 for (i = 0; i < vm_phys_early_nsegs; i++) { 1839 seg = &vm_phys_early_segs[i]; 1840 vm_phys_add_seg(seg->start, seg->end); 1841 } 1842 vm_phys_early_nsegs = -1; 1843 1844 #ifdef NUMA 1845 /* Force phys_avail to be split by domain. */ 1846 if (mem_affinity != NULL) { 1847 int idx; 1848 1849 for (i = 0; mem_affinity[i].end != 0; i++) { 1850 idx = vm_phys_avail_find(mem_affinity[i].start); 1851 if (idx != -1 && 1852 phys_avail[idx] != mem_affinity[i].start) 1853 vm_phys_avail_split(mem_affinity[i].start, idx); 1854 idx = vm_phys_avail_find(mem_affinity[i].end); 1855 if (idx != -1 && 1856 phys_avail[idx] != mem_affinity[i].end) 1857 vm_phys_avail_split(mem_affinity[i].end, idx); 1858 } 1859 } 1860 #endif 1861 } 1862 1863 #ifdef DDB 1864 /* 1865 * Show the number of physical pages in each of the free lists. 1866 */ 1867 DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE) 1868 { 1869 struct vm_freelist *fl; 1870 int flind, oind, pind, dom; 1871 1872 for (dom = 0; dom < vm_ndomains; dom++) { 1873 db_printf("DOMAIN: %d\n", dom); 1874 for (flind = 0; flind < vm_nfreelists; flind++) { 1875 db_printf("FREE LIST %d:\n" 1876 "\n ORDER (SIZE) | NUMBER" 1877 "\n ", flind); 1878 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1879 db_printf(" | POOL %d", pind); 1880 db_printf("\n-- "); 1881 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1882 db_printf("-- -- "); 1883 db_printf("--\n"); 1884 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 1885 db_printf(" %2.2d (%6.6dK)", oind, 1886 1 << (PAGE_SHIFT - 10 + oind)); 1887 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1888 fl = vm_phys_free_queues[dom][flind][pind]; 1889 db_printf(" | %6.6d", fl[oind].lcnt); 1890 } 1891 db_printf("\n"); 1892 } 1893 db_printf("\n"); 1894 } 1895 db_printf("\n"); 1896 } 1897 } 1898 #endif 1899