/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;

static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
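/*
 * Illustrative sketch (not part of the original source): with the
 * pair-of-addresses convention described above, a machine with two usable
 * RAM ranges might populate phys_avail as follows (addresses hypothetical):
 *
 *	phys_avail[0] = 0x1000;		phys_avail[1] = 0x9f000;
 *	phys_avail[2] = 0x100000;	phys_avail[3] = 0x80000000;
 *	phys_avail[4] = 0;		phys_avail[5] = 0;	(terminator)
 *
 * Code that walks the list typically iterates in steps of two until the
 * zero end address is reached, e.g.:
 *
 *	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 *		size += phys_avail[i + 1] - phys_avail[i];
 */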
/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

int
vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
    vm_paddr_t high __numa_used)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match: Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}
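/*
 * Illustrative note (not part of the original source): mem_locality is a
 * flat vm_ndomains x vm_ndomains matrix of relative access costs, typically
 * derived from the platform's locality information (e.g. the ACPI SLIT).
 * For a hypothetical two-domain system it might look like:
 *
 *	        to 0   to 1
 *	from 0    10     21
 *	from 1    21     10
 *
 * in which case vm_phys_mem_affinity(0, 1) would return 21 and
 * vm_phys_mem_affinity(0, 0) would return 10.
 */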
#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef	VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef	VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef	VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef	VM_FREELIST_DMA32
		if (
#ifdef	VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef	VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef	VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains __numa_used,
    struct mem_affinity *affinity __numa_used, int *locality __numa_used)
{
#ifdef NUMA
	int i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
	if (numa_disabled)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective is to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int tail)
{
	vm_page_t m_buddy;
	int pool = m->pool;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		KASSERT(m_buddy->pool == VM_NFREEPOOL,
		    ("vm_phys_split_pages: page %p has unexpected pool %d",
		    m_buddy, m_buddy->pool));
		m_buddy->pool = pool;
		vm_freelist_add(fl, m_buddy, oind, tail);
	}
}
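/*
 * Illustrative example (not part of the original source): splitting an
 * order-3 block (8 pages) to satisfy an order-0 request peels off the upper
 * buddy at each step, queueing one block of each intermediate order:
 *
 *	[0..7] order 3  ->  keep [0], queue [4..7] (order 2),
 *	                    queue [2..3] (order 1), queue [1] (order 0)
 *
 * Only the first page of the original block remains with the caller; the
 * buddies are placed back on fl at their respective orders.
 */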
/*
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
 * aligned and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    (VM_PAGE_TO_PHYS(m) &
	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
	    ("%s: page %p and npages %u are misaligned",
	    __func__, m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("%s: page %p has unexpected order %d",
		    __func__, m, m->order));
		order = fls(npages) - 1;
		KASSERT(order < VM_NFREEORDER,
		    ("%s: order %d is out of range", __func__, order));
		m->pool = pool;
		vm_freelist_add(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * If npages is zero, this function does nothing and ignores the physical page
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
 */
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
    int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		KASSERT(order < VM_NFREEORDER,
		    ("vm_phys_enq_range: order %d is out of range", order));
		m->pool = pool;
		vm_freelist_add(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
	return (m);
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().  Sets the pool field for
 * every allocated page.
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = i + (1 << oind);
				end = imin(npages, avail);
				ma[i++] = m++;
				while (i < end) {
					m->pool = pool;
					ma[i++] = m++;
				}
				if (i == npages) {
					/*
					 * Return excess pages to fl.  Its order
					 * [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - i, fl,
					    pool, 1);
					return (npages);
				}
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					avail = i + (1 << oind);
					end = imin(npages, avail);
					do {
						m->pool = pool;
						ma[i++] = m++;
					} while (i < end);
					if (i == npages) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail - i,
						    fl, pool, 1);
						return (npages);
					}
				}
			}
		}
	}
	return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.  Sets the pool field in the first page only.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}
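/*
 * Usage sketch (not part of the original source): a caller in the VM layer
 * that wants a single order-0 page from this allocator must hold the
 * per-domain free queue lock across the call, e.g.:
 *
 *	vm_domain_free_lock(VM_DOMAIN(domain));
 *	m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 0);
 *	vm_domain_free_unlock(VM_DOMAIN(domain));
 *
 * In practice the page allocator (vm_page_alloc() and friends) is the
 * intended consumer of these interfaces, per the note at the top of this
 * file.
 */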
/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.  Sets the pool field in the first page
 * only.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				m->pool = pool;
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, 1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return (&seg->first_page[atop(pa - seg->start)]);
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}
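/*
 * Usage sketch (not part of the original source): a device driver that wants
 * struct vm_page objects for a memory-mapped aperture not covered by
 * vm_page_array can register a fictitious range and tear it down later,
 * e.g. (with "bar_pa" and "bar_size" as purely hypothetical names):
 *
 *	error = vm_phys_fictitious_reg_range(bar_pa, bar_pa + bar_size,
 *	    VM_MEMATTR_WRITE_COMBINING);
 *	...
 *	vm_phys_fictitious_unreg_range(bar_pa, bar_pa + bar_size);
 *
 * The unregister call must use exactly the bounds that were registered.
 */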
/*
 * Free a contiguous, power of two-sized set of physical pages.  Assumes that
 * only the first page has a valid pool field.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;
	int pool = m->pool;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d", m, pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		vm_page_t m_start = m;
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			m_buddy->pool = VM_NFREEPOOL;
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
		if (m != m_start) {
			m_start->pool = VM_NFREEPOOL;
			m->pool = pool;
		}
	}
	fl = (*seg->free_queues)[pool];
	vm_freelist_add(fl, m, order, 1);
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without merging
 * across set boundaries.  Assumes no pages have a valid pool field.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	vm_paddr_t diff, lo;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[pool];
	m_end = m + npages;
	/* Free blocks of increasing size. */
	lo = atop(VM_PAGE_TO_PHYS(m));
	if (m < m_end &&
	    (diff = lo ^ (lo + npages - 1)) != 0) {
		order = min(flsll(diff) - 1, VM_NFREEORDER - 1);
		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl,
		    pool, 1);
	}

	/* Free blocks of maximum size. */
	order = VM_NFREEORDER - 1;
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		m->pool = pool;
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	vm_phys_enq_beg(m, m_end - m, fl, pool, 1);
}
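/*
 * Worked example (not part of the original source, assuming
 * VM_NFREEORDER - 1 >= 3): enqueueing a run of 13 pages that starts at page
 * frame number 3 is broken into aligned power-of-two blocks rather than
 * queued page by page:
 *
 *	PFN  3      -> one order-0 block
 *	PFNs 4..7   -> one order-2 block
 *	PFNs 8..15  -> one order-3 block
 *
 * Each block is naturally aligned to its size, so no buddy merging work is
 * needed while queueing it.
 */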
/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 * Assumes that every page has the same, valid, pool field value.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	vm_paddr_t lo;
	vm_page_t m_start, m_end;
	unsigned max_order, order_start, order_end;
	int pool = m->pool;

	KASSERT(pool < VM_NFREEPOOL,
	    ("%s: pool %d is out of range", __func__, pool));

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	lo = atop(VM_PAGE_TO_PHYS(m));
	max_order = min(flsll(lo ^ (lo + npages)) - 1, VM_NFREEORDER - 1);
	m_end = m + npages;
	for (m_start = m; m < m_end; m++)
		m->pool = VM_NFREEPOOL;
	m = m_start;
	order_start = ffsll(lo) - 1;
	if (order_start < max_order)
		m_start += 1 << order_start;
	order_end = ffsll(lo + npages) - 1;
	if (order_end < max_order)
		m_end -= 1 << order_end;
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, pool, m_end - m_start);
	if (order_start < max_order) {
		m->pool = pool;
		vm_phys_free_pages(m, order_start);
	}
	if (order_end < max_order) {
		m_end->pool = pool;
		vm_phys_free_pages(m_end, order_end);
	}
}

/*
 * Identify the first address range within segment segind or greater
 * that matches the domain, lies within the low/high range, and has
 * enough pages.  Return -1 if there is none.
 */
int
vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
    u_long npages, vm_paddr_t low, vm_paddr_t high)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_phys_seg *end_seg, *seg;

	KASSERT(npages > 0, ("npages is zero"));
	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
		if (seg->domain != domain)
			continue;
		if (seg->start >= high)
			return (-1);
		pa_start = MAX(low, seg->start);
		pa_end = MIN(high, seg->end);
		if (pa_end - pa_start < ptoa(npages))
			continue;
		bounds[0] = &seg->first_page[atop(pa_start - seg->start)];
		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
		return (seg - vm_phys_segs);
	}
	return (-1);
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return true.  Otherwise, return
 * false, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
bool
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order, pool;

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (false);
	}
	if (m_set->order < order)
		return (false);
	if (m_set->order == VM_NFREEORDER)
		return (false);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	pool = m_set->pool;
	fl = (*seg->free_queues)[pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		m_tmp->pool = pool;
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	m_set->pool = pool;
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (true);
}

/*
 * Find a run of contiguous physical pages, meeting alignment requirements, from
 * a list of max-sized page blocks, where we need at least two consecutive
 * blocks to satisfy the (large) page request.
 */
static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_phys_seg *seg;
	vm_page_t m, m_iter, m_ret;
	vm_paddr_t max_size, size;
	int max_order;

	max_order = VM_NFREEORDER - 1;
	size = npages << PAGE_SHIFT;
	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
	KASSERT(size > max_size, ("size is too small"));

	/*
	 * In order to avoid examining any free max-sized page block more than
	 * twice, identify the ones that are first in a physically-contiguous
	 * sequence of such blocks, and only for those walk the sequence to
	 * check if there are enough free blocks starting at a properly aligned
	 * block.  Thus, no block is checked for free-ness more than twice.
	 */
	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
		/*
		 * Skip m unless it is first in a sequence of free max page
		 * blocks >= low in its segment.
		 */
		seg = &vm_phys_segs[m->segind];
		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
			continue;
		if (VM_PAGE_TO_PHYS(m) >= max_size &&
		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
		    max_order == m[-1 << max_order].order)
			continue;

		/*
		 * Advance m_ret from m to the first of the sequence, if any,
		 * that satisfies alignment conditions and might leave enough
		 * space.
		 */
		m_ret = m;
		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
		    size, alignment, boundary) &&
		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
		    max_order == m_ret[1 << max_order].order)
			m_ret += 1 << max_order;

		/*
		 * Skip m unless some block m_ret in the sequence is properly
		 * aligned, and begins a sequence of enough pages less than
		 * high, and in the same segment.
		 */
		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
			continue;

		/*
		 * Skip m unless the blocks to allocate starting at m_ret are
		 * all free.
		 */
		for (m_iter = m_ret;
		    m_iter < m_ret + npages && max_order == m_iter->order;
		    m_iter += 1 << max_order) {
		}
		if (m_iter < m_ret + npages)
			continue;
		return (m_ret);
	}
	return (NULL);
}

/*
 * Find a run of contiguous physical pages from the specified free list
 * table.
 */
static vm_page_t
vm_phys_find_queues_contig(
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
    u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_page_t m_ret;
	vm_paddr_t pa, pa_end, size;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Compute the queue that is the best fit for npages. */
	order = flsl(npages - 1);
	/* Search for a large enough free block. */
	size = npages << PAGE_SHIFT;
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = (*queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
				/*
				 * Determine if the address range starting at pa
				 * is within the given range, satisfies the
				 * given alignment, and does not cross the given
				 * boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (low <= pa && pa_end <= high &&
				    vm_addr_ok(pa, size, alignment, boundary))
					return (m_ret);
			}
		}
	}
	if (order < VM_NFREEORDER)
		return (NULL);
	/* Search for a long-enough sequence of max-order blocks. */
	for (pind = 0; pind < VM_NFREEPOOL; pind++) {
		fl = (*queues)[pind];
		m_ret = vm_phys_find_freelist_contig(fl, npages,
		    low, high, alignment, boundary);
		if (m_ret != NULL)
			return (m_ret);
	}
	return (NULL);
}
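/*
 * Usage sketch (not part of the original source): a caller that needs a
 * physically contiguous, 2MB-aligned run of 512 base pages (2MB with 4K
 * pages) below 4GB from domain 0 might, with that domain's free queue lock
 * held, issue:
 *
 *	m = vm_phys_alloc_contig(0, 512, 0, (vm_paddr_t)1 << 32,
 *	    2 * 1024 * 1024, 0);
 *
 * A boundary of 0 means the run may cross any address boundary; see the
 * description of vm_phys_alloc_contig() below.
 */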
/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.  Sets the pool
 * field in every allocated page.
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_freelist *fl;
	vm_page_t m, m_run;
	struct vm_phys_seg *seg;
	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
	int oind, segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);
	queues = NULL;
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		/*
		 * If a previous segment led to a search using
		 * the same free lists as would this segment, then
		 * we've actually already searched within this
		 * too.  So skip it.
		 */
		if (seg->free_queues == queues)
			continue;
		queues = seg->free_queues;
		m_run = vm_phys_find_queues_contig(queues, npages,
		    low, high, alignment, boundary);
		if (m_run != NULL)
			break;
	}
	if (m_run == NULL)
		return (NULL);

	/* Allocate pages from the page-range found. */
	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
		fl = (*queues)[m->pool];
		oind = m->order;
		vm_freelist_rem(fl, m, oind);
	}
	/* Return excess pages to the free lists. */
	fl = (*queues)[VM_FREEPOOL_DEFAULT];
	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl,
	    VM_FREEPOOL_DEFAULT, 0);

	/* Return page verified to satisfy conditions of request. */
	for (m = m_run; m < &m_run[npages]; m++)
		m->pool = VM_FREEPOOL_DEFAULT;

	pa_start = VM_PAGE_TO_PHYS(m_run);
	KASSERT(low <= pa_start,
	    ("memory allocated below minimum requested range"));
	KASSERT(pa_start + ptoa(npages) <= high,
	    ("memory allocated above maximum requested range"));
	seg = &vm_phys_segs[m_run->segind];
	KASSERT(seg->domain == domain,
	    ("memory not allocated from specified domain"));
	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
	    ("memory alignment/boundary constraints not satisfied"));
	return (m_run);
}

/*
 * Return the index of the first unused slot which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		continue;
	if (i > PHYS_AVAIL_ENTRIES)
		panic("Improperly terminated phys_avail %d entries", i);

	return (i);
}

/*
 * Assert that a phys_avail entry is valid.
 */
static void
vm_phys_avail_check(int i)
{
	if (phys_avail[i] & PAGE_MASK)
		panic("Unaligned phys_avail[%d]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i+1] & PAGE_MASK)
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] < phys_avail[i])
		panic("phys_avail[%d] start %#jx < end %#jx", i,
		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
			return (i);
	return (-1);
}
#endif

/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}

	return (largest);
}

vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split an entry at the address 'pa'.  Return zero on success or errno.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
		panic("vm_phys_avail_split: invalid address");
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i+2);

	return (0);
}

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest physical segment within the desired
	 * numa domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		if (phys_avail[i+1] - alloc_size < mem_start ||
		    phys_avail[i+1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}
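/*
 * Worked example (not part of the original source, all addresses
 * hypothetical): for a 2MB (0x200000 byte) early allocation from a region
 * whose end is phys_avail[biggestone + 1] = 0x7fe3000, the natural-alignment
 * logic above computes
 *
 *	align = 0x7fe3000 & 0x1fffff = 0x1e3000
 *
 * splits (or trims) the region so that its new end is 0x7e00000, and then
 * returns pa = 0x7e00000 - 0x200000 = 0x7c00000, which is 2MB aligned.
 */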
#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif