1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2002-2006 Rice University 5 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu> 6 * All rights reserved. 7 * 8 * This software was developed for the FreeBSD Project by Alan L. Cox, 9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Physical memory system implementation 36 * 37 * Any external functions defined by this module are only to be used by the 38 * virtual memory system. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

/*
 * A long must have at least as many bits as there can be physical segments.
 * NOTE(review): presumably so that a set of segments fits in a long bitmask;
 * confirm against the users of VM_PHYSSEG_MAX.
 */
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");

#ifdef NUMA
/*
 * NUMA affinity and locality tables.  Both remain NULL until
 * vm_phys_register_domains() installs tables for more than one domain.
 */
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;
#endif

/* Number of memory domains in use; a single domain until told otherwise. */
int __read_mostly vm_ndomains = 1;
/* Set of all domains; domain 0 is always a member. */
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

/* Table of physical memory segments and the number of entries in use. */
struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

/* Red-black tree of the registered fictitious ranges. */
RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

/*
 * One registered range of fictitious pages.  The range is half-open,
 * [start, end), and first_page points at the vm_page describing "start".
 * A node with end == 0 is used only as a lookup key for a single address
 * (see vm_phys_fictitious_cmp()).
 */
struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t start;
	vm_paddr_t end;
	vm_page_t first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

/* Held around every lookup, insertion and removal on the fictitious tree. */
static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

/* Free page queues, indexed by [domain][free list index][pool][order]. */
static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

/* Number of free lists actually created; computed by vm_phys_init(). */
static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
/* First physical address that is not reachable with 32 address bits (4G). */
#define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

/* vm.phys_free: per-domain, per-free-list table of free block counts. */
static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

/* vm.phys_segs: listing of the physical memory segments. */
static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
/* vm.phys_locality: the domain-to-domain locality table. */
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

/* Forward declarations of file-local helpers. */
static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
    vm_paddr_t boundary);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

/*
 * Red-black tree helpers for vm fictitious range management.
183 */ 184 static inline int 185 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, 186 struct vm_phys_fictitious_seg *range) 187 { 188 189 KASSERT(range->start != 0 && range->end != 0, 190 ("Invalid range passed on search for vm_fictitious page")); 191 if (p->start >= range->end) 192 return (1); 193 if (p->start < range->start) 194 return (-1); 195 196 return (0); 197 } 198 199 static int 200 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, 201 struct vm_phys_fictitious_seg *p2) 202 { 203 204 /* Check if this is a search for a page */ 205 if (p1->end == 0) 206 return (vm_phys_fictitious_in_range(p1, p2)); 207 208 KASSERT(p2->end != 0, 209 ("Invalid range passed as second parameter to vm fictitious comparison")); 210 211 /* Searching to add a new range */ 212 if (p1->end <= p2->start) 213 return (-1); 214 if (p1->start >= p2->end) 215 return (1); 216 217 panic("Trying to add overlapping vm fictitious ranges:\n" 218 "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, 219 (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); 220 } 221 222 int 223 vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high) 224 { 225 #ifdef NUMA 226 domainset_t mask; 227 int i; 228 229 if (vm_ndomains == 1 || mem_affinity == NULL) 230 return (0); 231 232 DOMAINSET_ZERO(&mask); 233 /* 234 * Check for any memory that overlaps low, high. 235 */ 236 for (i = 0; mem_affinity[i].end != 0; i++) 237 if (mem_affinity[i].start <= high && 238 mem_affinity[i].end >= low) 239 DOMAINSET_SET(mem_affinity[i].domain, &mask); 240 if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask)) 241 return (prefer); 242 if (DOMAINSET_EMPTY(&mask)) 243 panic("vm_phys_domain_match: Impossible constraint"); 244 return (DOMAINSET_FFS(&mask) - 1); 245 #else 246 return (0); 247 #endif 248 } 249 250 /* 251 * Outputs the state of the physical memory allocator, specifically, 252 * the amount of physical memory in each free list. 
253 */ 254 static int 255 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) 256 { 257 struct sbuf sbuf; 258 struct vm_freelist *fl; 259 int dom, error, flind, oind, pind; 260 261 error = sysctl_wire_old_buffer(req, 0); 262 if (error != 0) 263 return (error); 264 sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); 265 for (dom = 0; dom < vm_ndomains; dom++) { 266 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); 267 for (flind = 0; flind < vm_nfreelists; flind++) { 268 sbuf_printf(&sbuf, "\nFREE LIST %d:\n" 269 "\n ORDER (SIZE) | NUMBER" 270 "\n ", flind); 271 for (pind = 0; pind < VM_NFREEPOOL; pind++) 272 sbuf_printf(&sbuf, " | POOL %d", pind); 273 sbuf_printf(&sbuf, "\n-- "); 274 for (pind = 0; pind < VM_NFREEPOOL; pind++) 275 sbuf_printf(&sbuf, "-- -- "); 276 sbuf_printf(&sbuf, "--\n"); 277 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 278 sbuf_printf(&sbuf, " %2d (%6dK)", oind, 279 1 << (PAGE_SHIFT - 10 + oind)); 280 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 281 fl = vm_phys_free_queues[dom][flind][pind]; 282 sbuf_printf(&sbuf, " | %6d", 283 fl[oind].lcnt); 284 } 285 sbuf_printf(&sbuf, "\n"); 286 } 287 } 288 } 289 error = sbuf_finish(&sbuf); 290 sbuf_delete(&sbuf); 291 return (error); 292 } 293 294 /* 295 * Outputs the set of physical memory segments. 
296 */ 297 static int 298 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) 299 { 300 struct sbuf sbuf; 301 struct vm_phys_seg *seg; 302 int error, segind; 303 304 error = sysctl_wire_old_buffer(req, 0); 305 if (error != 0) 306 return (error); 307 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 308 for (segind = 0; segind < vm_phys_nsegs; segind++) { 309 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); 310 seg = &vm_phys_segs[segind]; 311 sbuf_printf(&sbuf, "start: %#jx\n", 312 (uintmax_t)seg->start); 313 sbuf_printf(&sbuf, "end: %#jx\n", 314 (uintmax_t)seg->end); 315 sbuf_printf(&sbuf, "domain: %d\n", seg->domain); 316 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); 317 } 318 error = sbuf_finish(&sbuf); 319 sbuf_delete(&sbuf); 320 return (error); 321 } 322 323 /* 324 * Return affinity, or -1 if there's no affinity information. 325 */ 326 int 327 vm_phys_mem_affinity(int f, int t) 328 { 329 330 #ifdef NUMA 331 if (mem_locality == NULL) 332 return (-1); 333 if (f >= vm_ndomains || t >= vm_ndomains) 334 return (-1); 335 return (mem_locality[f * vm_ndomains + t]); 336 #else 337 return (-1); 338 #endif 339 } 340 341 #ifdef NUMA 342 /* 343 * Outputs the VM locality table. 
344 */ 345 static int 346 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) 347 { 348 struct sbuf sbuf; 349 int error, i, j; 350 351 error = sysctl_wire_old_buffer(req, 0); 352 if (error != 0) 353 return (error); 354 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 355 356 sbuf_printf(&sbuf, "\n"); 357 358 for (i = 0; i < vm_ndomains; i++) { 359 sbuf_printf(&sbuf, "%d: ", i); 360 for (j = 0; j < vm_ndomains; j++) { 361 sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); 362 } 363 sbuf_printf(&sbuf, "\n"); 364 } 365 error = sbuf_finish(&sbuf); 366 sbuf_delete(&sbuf); 367 return (error); 368 } 369 #endif 370 371 static void 372 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) 373 { 374 375 m->order = order; 376 if (tail) 377 TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); 378 else 379 TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); 380 fl[order].lcnt++; 381 } 382 383 static void 384 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) 385 { 386 387 TAILQ_REMOVE(&fl[order].pl, m, listq); 388 fl[order].lcnt--; 389 m->order = VM_NFREEORDER; 390 } 391 392 /* 393 * Create a physical memory segment. 
394 */ 395 static void 396 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) 397 { 398 struct vm_phys_seg *seg; 399 400 KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, 401 ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); 402 KASSERT(domain >= 0 && domain < vm_ndomains, 403 ("vm_phys_create_seg: invalid domain provided")); 404 seg = &vm_phys_segs[vm_phys_nsegs++]; 405 while (seg > vm_phys_segs && (seg - 1)->start >= end) { 406 *seg = *(seg - 1); 407 seg--; 408 } 409 seg->start = start; 410 seg->end = end; 411 seg->domain = domain; 412 } 413 414 static void 415 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) 416 { 417 #ifdef NUMA 418 int i; 419 420 if (mem_affinity == NULL) { 421 _vm_phys_create_seg(start, end, 0); 422 return; 423 } 424 425 for (i = 0;; i++) { 426 if (mem_affinity[i].end == 0) 427 panic("Reached end of affinity info"); 428 if (mem_affinity[i].end <= start) 429 continue; 430 if (mem_affinity[i].start > start) 431 panic("No affinity info for start %jx", 432 (uintmax_t)start); 433 if (mem_affinity[i].end >= end) { 434 _vm_phys_create_seg(start, end, 435 mem_affinity[i].domain); 436 break; 437 } 438 _vm_phys_create_seg(start, mem_affinity[i].end, 439 mem_affinity[i].domain); 440 start = mem_affinity[i].end; 441 } 442 #else 443 _vm_phys_create_seg(start, end, 0); 444 #endif 445 } 446 447 /* 448 * Add a physical memory segment. 449 */ 450 void 451 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) 452 { 453 vm_paddr_t paddr; 454 455 KASSERT((start & PAGE_MASK) == 0, 456 ("vm_phys_define_seg: start is not page aligned")); 457 KASSERT((end & PAGE_MASK) == 0, 458 ("vm_phys_define_seg: end is not page aligned")); 459 460 /* 461 * Split the physical memory segment if it spans two or more free 462 * list boundaries. 
463 */ 464 paddr = start; 465 #ifdef VM_FREELIST_LOWMEM 466 if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { 467 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); 468 paddr = VM_LOWMEM_BOUNDARY; 469 } 470 #endif 471 #ifdef VM_FREELIST_DMA32 472 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { 473 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); 474 paddr = VM_DMA32_BOUNDARY; 475 } 476 #endif 477 vm_phys_create_seg(paddr, end); 478 } 479 480 /* 481 * Initialize the physical memory allocator. 482 * 483 * Requires that vm_page_array is initialized! 484 */ 485 void 486 vm_phys_init(void) 487 { 488 struct vm_freelist *fl; 489 struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg; 490 u_long npages; 491 int dom, flind, freelist, oind, pind, segind; 492 493 /* 494 * Compute the number of free lists, and generate the mapping from the 495 * manifest constants VM_FREELIST_* to the free list indices. 496 * 497 * Initially, the entries of vm_freelist_to_flind[] are set to either 498 * 0 or 1 to indicate which free lists should be created. 499 */ 500 npages = 0; 501 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 502 seg = &vm_phys_segs[segind]; 503 #ifdef VM_FREELIST_LOWMEM 504 if (seg->end <= VM_LOWMEM_BOUNDARY) 505 vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; 506 else 507 #endif 508 #ifdef VM_FREELIST_DMA32 509 if ( 510 #ifdef VM_DMA32_NPAGES_THRESHOLD 511 /* 512 * Create the DMA32 free list only if the amount of 513 * physical memory above physical address 4G exceeds the 514 * given threshold. 515 */ 516 npages > VM_DMA32_NPAGES_THRESHOLD && 517 #endif 518 seg->end <= VM_DMA32_BOUNDARY) 519 vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; 520 else 521 #endif 522 { 523 npages += atop(seg->end - seg->start); 524 vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; 525 } 526 } 527 /* Change each entry into a running total of the free lists. 
*/ 528 for (freelist = 1; freelist < VM_NFREELIST; freelist++) { 529 vm_freelist_to_flind[freelist] += 530 vm_freelist_to_flind[freelist - 1]; 531 } 532 vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; 533 KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); 534 /* Change each entry into a free list index. */ 535 for (freelist = 0; freelist < VM_NFREELIST; freelist++) 536 vm_freelist_to_flind[freelist]--; 537 538 /* 539 * Initialize the first_page and free_queues fields of each physical 540 * memory segment. 541 */ 542 #ifdef VM_PHYSSEG_SPARSE 543 npages = 0; 544 #endif 545 for (segind = 0; segind < vm_phys_nsegs; segind++) { 546 seg = &vm_phys_segs[segind]; 547 #ifdef VM_PHYSSEG_SPARSE 548 seg->first_page = &vm_page_array[npages]; 549 npages += atop(seg->end - seg->start); 550 #else 551 seg->first_page = PHYS_TO_VM_PAGE(seg->start); 552 #endif 553 #ifdef VM_FREELIST_LOWMEM 554 if (seg->end <= VM_LOWMEM_BOUNDARY) { 555 flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; 556 KASSERT(flind >= 0, 557 ("vm_phys_init: LOWMEM flind < 0")); 558 } else 559 #endif 560 #ifdef VM_FREELIST_DMA32 561 if (seg->end <= VM_DMA32_BOUNDARY) { 562 flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; 563 KASSERT(flind >= 0, 564 ("vm_phys_init: DMA32 flind < 0")); 565 } else 566 #endif 567 { 568 flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; 569 KASSERT(flind >= 0, 570 ("vm_phys_init: DEFAULT flind < 0")); 571 } 572 seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; 573 } 574 575 /* 576 * Coalesce physical memory segments that are contiguous and share the 577 * same per-domain free queues. 
578 */ 579 prev_seg = vm_phys_segs; 580 seg = &vm_phys_segs[1]; 581 end_seg = &vm_phys_segs[vm_phys_nsegs]; 582 while (seg < end_seg) { 583 if (prev_seg->end == seg->start && 584 prev_seg->free_queues == seg->free_queues) { 585 prev_seg->end = seg->end; 586 KASSERT(prev_seg->domain == seg->domain, 587 ("vm_phys_init: free queues cannot span domains")); 588 vm_phys_nsegs--; 589 end_seg--; 590 for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++) 591 *tmp_seg = *(tmp_seg + 1); 592 } else { 593 prev_seg = seg; 594 seg++; 595 } 596 } 597 598 /* 599 * Initialize the free queues. 600 */ 601 for (dom = 0; dom < vm_ndomains; dom++) { 602 for (flind = 0; flind < vm_nfreelists; flind++) { 603 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 604 fl = vm_phys_free_queues[dom][flind][pind]; 605 for (oind = 0; oind < VM_NFREEORDER; oind++) 606 TAILQ_INIT(&fl[oind].pl); 607 } 608 } 609 } 610 611 rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); 612 } 613 614 /* 615 * Register info about the NUMA topology of the system. 616 * 617 * Invoked by platform-dependent code prior to vm_phys_init(). 618 */ 619 void 620 vm_phys_register_domains(int ndomains, struct mem_affinity *affinity, 621 int *locality) 622 { 623 #ifdef NUMA 624 int d, i; 625 626 /* 627 * For now the only override value that we support is 1, which 628 * effectively disables NUMA-awareness in the allocators. 629 */ 630 d = 0; 631 TUNABLE_INT_FETCH("vm.numa.disabled", &d); 632 if (d) 633 ndomains = 1; 634 635 if (ndomains > 1) { 636 vm_ndomains = ndomains; 637 mem_affinity = affinity; 638 mem_locality = locality; 639 } 640 641 for (i = 0; i < vm_ndomains; i++) 642 DOMAINSET_SET(i, &all_domains); 643 #else 644 (void)ndomains; 645 (void)affinity; 646 (void)locality; 647 #endif 648 } 649 650 int 651 _vm_phys_domain(vm_paddr_t pa) 652 { 653 #ifdef NUMA 654 int i; 655 656 if (vm_ndomains == 1 || mem_affinity == NULL) 657 return (0); 658 659 /* 660 * Check for any memory that overlaps. 
661 */ 662 for (i = 0; mem_affinity[i].end != 0; i++) 663 if (mem_affinity[i].start <= pa && 664 mem_affinity[i].end >= pa) 665 return (mem_affinity[i].domain); 666 #endif 667 return (0); 668 } 669 670 /* 671 * Split a contiguous, power of two-sized set of physical pages. 672 * 673 * When this function is called by a page allocation function, the caller 674 * should request insertion at the head unless the order [order, oind) queues 675 * are known to be empty. The objective being to reduce the likelihood of 676 * long-term fragmentation by promoting contemporaneous allocation and 677 * (hopefully) deallocation. 678 */ 679 static __inline void 680 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, 681 int tail) 682 { 683 vm_page_t m_buddy; 684 685 while (oind > order) { 686 oind--; 687 m_buddy = &m[1 << oind]; 688 KASSERT(m_buddy->order == VM_NFREEORDER, 689 ("vm_phys_split_pages: page %p has unexpected order %d", 690 m_buddy, m_buddy->order)); 691 vm_freelist_add(fl, m_buddy, oind, tail); 692 } 693 } 694 695 /* 696 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned 697 * and sized set to the specified free list. 698 * 699 * When this function is called by a page allocation function, the caller 700 * should request insertion at the head unless the lower-order queues are 701 * known to be empty. The objective being to reduce the likelihood of long- 702 * term fragmentation by promoting contemporaneous allocation and (hopefully) 703 * deallocation. 704 * 705 * The physical page m's buddy must not be free. 
706 */ 707 static void 708 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) 709 { 710 u_int n; 711 int order; 712 713 KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0")); 714 KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) & 715 ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0, 716 ("vm_phys_enq_range: page %p and npages %u are misaligned", 717 m, npages)); 718 do { 719 KASSERT(m->order == VM_NFREEORDER, 720 ("vm_phys_enq_range: page %p has unexpected order %d", 721 m, m->order)); 722 order = ffs(npages) - 1; 723 KASSERT(order < VM_NFREEORDER, 724 ("vm_phys_enq_range: order %d is out of range", order)); 725 vm_freelist_add(fl, m, order, tail); 726 n = 1 << order; 727 m += n; 728 npages -= n; 729 } while (npages > 0); 730 } 731 732 /* 733 * Tries to allocate the specified number of pages from the specified pool 734 * within the specified domain. Returns the actual number of allocated pages 735 * and a pointer to each page through the array ma[]. 736 * 737 * The returned pages may not be physically contiguous. However, in contrast 738 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0), 739 * calling this function once to allocate the desired number of pages will 740 * avoid wasted time in vm_phys_split_pages(). 741 * 742 * The free page queues for the specified domain must be locked. 
743 */ 744 int 745 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) 746 { 747 struct vm_freelist *alt, *fl; 748 vm_page_t m; 749 int avail, end, flind, freelist, i, need, oind, pind; 750 751 KASSERT(domain >= 0 && domain < vm_ndomains, 752 ("vm_phys_alloc_npages: domain %d is out of range", domain)); 753 KASSERT(pool < VM_NFREEPOOL, 754 ("vm_phys_alloc_npages: pool %d is out of range", pool)); 755 KASSERT(npages <= 1 << (VM_NFREEORDER - 1), 756 ("vm_phys_alloc_npages: npages %d is out of range", npages)); 757 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 758 i = 0; 759 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 760 flind = vm_freelist_to_flind[freelist]; 761 if (flind < 0) 762 continue; 763 fl = vm_phys_free_queues[domain][flind][pool]; 764 for (oind = 0; oind < VM_NFREEORDER; oind++) { 765 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { 766 vm_freelist_rem(fl, m, oind); 767 avail = 1 << oind; 768 need = imin(npages - i, avail); 769 for (end = i + need; i < end;) 770 ma[i++] = m++; 771 if (need < avail) { 772 /* 773 * Return excess pages to fl. Its 774 * order [0, oind) queues are empty. 775 */ 776 vm_phys_enq_range(m, avail - need, fl, 777 1); 778 return (npages); 779 } else if (i == npages) 780 return (npages); 781 } 782 } 783 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 784 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 785 alt = vm_phys_free_queues[domain][flind][pind]; 786 while ((m = TAILQ_FIRST(&alt[oind].pl)) != 787 NULL) { 788 vm_freelist_rem(alt, m, oind); 789 vm_phys_set_pool(pool, m, oind); 790 avail = 1 << oind; 791 need = imin(npages - i, avail); 792 for (end = i + need; i < end;) 793 ma[i++] = m++; 794 if (need < avail) { 795 /* 796 * Return excess pages to fl. 797 * Its order [0, oind) queues 798 * are empty. 
799 */ 800 vm_phys_enq_range(m, avail - 801 need, fl, 1); 802 return (npages); 803 } else if (i == npages) 804 return (npages); 805 } 806 } 807 } 808 } 809 return (i); 810 } 811 812 /* 813 * Allocate a contiguous, power of two-sized set of physical pages 814 * from the free lists. 815 * 816 * The free page queues must be locked. 817 */ 818 vm_page_t 819 vm_phys_alloc_pages(int domain, int pool, int order) 820 { 821 vm_page_t m; 822 int freelist; 823 824 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 825 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); 826 if (m != NULL) 827 return (m); 828 } 829 return (NULL); 830 } 831 832 /* 833 * Allocate a contiguous, power of two-sized set of physical pages from the 834 * specified free list. The free list must be specified using one of the 835 * manifest constants VM_FREELIST_*. 836 * 837 * The free page queues must be locked. 838 */ 839 vm_page_t 840 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) 841 { 842 struct vm_freelist *alt, *fl; 843 vm_page_t m; 844 int oind, pind, flind; 845 846 KASSERT(domain >= 0 && domain < vm_ndomains, 847 ("vm_phys_alloc_freelist_pages: domain %d is out of range", 848 domain)); 849 KASSERT(freelist < VM_NFREELIST, 850 ("vm_phys_alloc_freelist_pages: freelist %d is out of range", 851 freelist)); 852 KASSERT(pool < VM_NFREEPOOL, 853 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); 854 KASSERT(order < VM_NFREEORDER, 855 ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); 856 857 flind = vm_freelist_to_flind[freelist]; 858 /* Check if freelist is present */ 859 if (flind < 0) 860 return (NULL); 861 862 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 863 fl = &vm_phys_free_queues[domain][flind][pool][0]; 864 for (oind = order; oind < VM_NFREEORDER; oind++) { 865 m = TAILQ_FIRST(&fl[oind].pl); 866 if (m != NULL) { 867 vm_freelist_rem(fl, m, oind); 868 /* The order [order, oind) queues are empty. 
*/ 869 vm_phys_split_pages(m, oind, fl, order, 1); 870 return (m); 871 } 872 } 873 874 /* 875 * The given pool was empty. Find the largest 876 * contiguous, power-of-two-sized set of pages in any 877 * pool. Transfer these pages to the given pool, and 878 * use them to satisfy the allocation. 879 */ 880 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { 881 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 882 alt = &vm_phys_free_queues[domain][flind][pind][0]; 883 m = TAILQ_FIRST(&alt[oind].pl); 884 if (m != NULL) { 885 vm_freelist_rem(alt, m, oind); 886 vm_phys_set_pool(pool, m, oind); 887 /* The order [order, oind) queues are empty. */ 888 vm_phys_split_pages(m, oind, fl, order, 1); 889 return (m); 890 } 891 } 892 } 893 return (NULL); 894 } 895 896 /* 897 * Find the vm_page corresponding to the given physical address. 898 */ 899 vm_page_t 900 vm_phys_paddr_to_vm_page(vm_paddr_t pa) 901 { 902 struct vm_phys_seg *seg; 903 int segind; 904 905 for (segind = 0; segind < vm_phys_nsegs; segind++) { 906 seg = &vm_phys_segs[segind]; 907 if (pa >= seg->start && pa < seg->end) 908 return (&seg->first_page[atop(pa - seg->start)]); 909 } 910 return (NULL); 911 } 912 913 vm_page_t 914 vm_phys_fictitious_to_vm_page(vm_paddr_t pa) 915 { 916 struct vm_phys_fictitious_seg tmp, *seg; 917 vm_page_t m; 918 919 m = NULL; 920 tmp.start = pa; 921 tmp.end = 0; 922 923 rw_rlock(&vm_phys_fictitious_reg_lock); 924 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 925 rw_runlock(&vm_phys_fictitious_reg_lock); 926 if (seg == NULL) 927 return (NULL); 928 929 m = &seg->first_page[atop(pa - seg->start)]; 930 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); 931 932 return (m); 933 } 934 935 static inline void 936 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, 937 long page_count, vm_memattr_t memattr) 938 { 939 long i; 940 941 bzero(range, page_count * sizeof(*range)); 942 for (i = 0; i < page_count; i++) { 943 vm_page_initfake(&range[i], start + 
PAGE_SIZE * i, memattr); 944 range[i].oflags &= ~VPO_UNMANAGED; 945 range[i].busy_lock = VPB_UNBUSIED; 946 } 947 } 948 949 int 950 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, 951 vm_memattr_t memattr) 952 { 953 struct vm_phys_fictitious_seg *seg; 954 vm_page_t fp; 955 long page_count; 956 #ifdef VM_PHYSSEG_DENSE 957 long pi, pe; 958 long dpage_count; 959 #endif 960 961 KASSERT(start < end, 962 ("Start of segment isn't less than end (start: %jx end: %jx)", 963 (uintmax_t)start, (uintmax_t)end)); 964 965 page_count = (end - start) / PAGE_SIZE; 966 967 #ifdef VM_PHYSSEG_DENSE 968 pi = atop(start); 969 pe = atop(end); 970 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 971 fp = &vm_page_array[pi - first_page]; 972 if ((pe - first_page) > vm_page_array_size) { 973 /* 974 * We have a segment that starts inside 975 * of vm_page_array, but ends outside of it. 976 * 977 * Use vm_page_array pages for those that are 978 * inside of the vm_page_array range, and 979 * allocate the remaining ones. 980 */ 981 dpage_count = vm_page_array_size - (pi - first_page); 982 vm_phys_fictitious_init_range(fp, start, dpage_count, 983 memattr); 984 page_count -= dpage_count; 985 start += ptoa(dpage_count); 986 goto alloc; 987 } 988 /* 989 * We can allocate the full range from vm_page_array, 990 * so there's no need to register the range in the tree. 991 */ 992 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 993 return (0); 994 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 995 /* 996 * We have a segment that ends inside of vm_page_array, 997 * but starts outside of it. 
998 */ 999 fp = &vm_page_array[0]; 1000 dpage_count = pe - first_page; 1001 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, 1002 memattr); 1003 end -= ptoa(dpage_count); 1004 page_count -= dpage_count; 1005 goto alloc; 1006 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1007 /* 1008 * Trying to register a fictitious range that expands before 1009 * and after vm_page_array. 1010 */ 1011 return (EINVAL); 1012 } else { 1013 alloc: 1014 #endif 1015 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, 1016 M_WAITOK); 1017 #ifdef VM_PHYSSEG_DENSE 1018 } 1019 #endif 1020 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1021 1022 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); 1023 seg->start = start; 1024 seg->end = end; 1025 seg->first_page = fp; 1026 1027 rw_wlock(&vm_phys_fictitious_reg_lock); 1028 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); 1029 rw_wunlock(&vm_phys_fictitious_reg_lock); 1030 1031 return (0); 1032 } 1033 1034 void 1035 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) 1036 { 1037 struct vm_phys_fictitious_seg *seg, tmp; 1038 #ifdef VM_PHYSSEG_DENSE 1039 long pi, pe; 1040 #endif 1041 1042 KASSERT(start < end, 1043 ("Start of segment isn't less than end (start: %jx end: %jx)", 1044 (uintmax_t)start, (uintmax_t)end)); 1045 1046 #ifdef VM_PHYSSEG_DENSE 1047 pi = atop(start); 1048 pe = atop(end); 1049 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1050 if ((pe - first_page) <= vm_page_array_size) { 1051 /* 1052 * This segment was allocated using vm_page_array 1053 * only, there's nothing to do since those pages 1054 * were never added to the tree. 1055 */ 1056 return; 1057 } 1058 /* 1059 * We have a segment that starts inside 1060 * of vm_page_array, but ends outside of it. 1061 * 1062 * Calculate how many pages were added to the 1063 * tree and free them. 
1064 */ 1065 start = ptoa(first_page + vm_page_array_size); 1066 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1067 /* 1068 * We have a segment that ends inside of vm_page_array, 1069 * but starts outside of it. 1070 */ 1071 end = ptoa(first_page); 1072 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1073 /* Since it's not possible to register such a range, panic. */ 1074 panic( 1075 "Unregistering not registered fictitious range [%#jx:%#jx]", 1076 (uintmax_t)start, (uintmax_t)end); 1077 } 1078 #endif 1079 tmp.start = start; 1080 tmp.end = 0; 1081 1082 rw_wlock(&vm_phys_fictitious_reg_lock); 1083 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1084 if (seg->start != start || seg->end != end) { 1085 rw_wunlock(&vm_phys_fictitious_reg_lock); 1086 panic( 1087 "Unregistering not registered fictitious range [%#jx:%#jx]", 1088 (uintmax_t)start, (uintmax_t)end); 1089 } 1090 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); 1091 rw_wunlock(&vm_phys_fictitious_reg_lock); 1092 free(seg->first_page, M_FICT_PAGES); 1093 free(seg, M_FICT_PAGES); 1094 } 1095 1096 /* 1097 * Free a contiguous, power of two-sized set of physical pages. 1098 * 1099 * The free page queues must be locked. 
1100 */ 1101 void 1102 vm_phys_free_pages(vm_page_t m, int order) 1103 { 1104 struct vm_freelist *fl; 1105 struct vm_phys_seg *seg; 1106 vm_paddr_t pa; 1107 vm_page_t m_buddy; 1108 1109 KASSERT(m->order == VM_NFREEORDER, 1110 ("vm_phys_free_pages: page %p has unexpected order %d", 1111 m, m->order)); 1112 KASSERT(m->pool < VM_NFREEPOOL, 1113 ("vm_phys_free_pages: page %p has unexpected pool %d", 1114 m, m->pool)); 1115 KASSERT(order < VM_NFREEORDER, 1116 ("vm_phys_free_pages: order %d is out of range", order)); 1117 seg = &vm_phys_segs[m->segind]; 1118 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1119 if (order < VM_NFREEORDER - 1) { 1120 pa = VM_PAGE_TO_PHYS(m); 1121 do { 1122 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); 1123 if (pa < seg->start || pa >= seg->end) 1124 break; 1125 m_buddy = &seg->first_page[atop(pa - seg->start)]; 1126 if (m_buddy->order != order) 1127 break; 1128 fl = (*seg->free_queues)[m_buddy->pool]; 1129 vm_freelist_rem(fl, m_buddy, order); 1130 if (m_buddy->pool != m->pool) 1131 vm_phys_set_pool(m->pool, m_buddy, order); 1132 order++; 1133 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); 1134 m = &seg->first_page[atop(pa - seg->start)]; 1135 } while (order < VM_NFREEORDER - 1); 1136 } 1137 fl = (*seg->free_queues)[m->pool]; 1138 vm_freelist_add(fl, m, order, 1); 1139 } 1140 1141 /* 1142 * Return the largest possible order of a set of pages starting at m. 1143 */ 1144 static int 1145 max_order(vm_page_t m) 1146 { 1147 1148 /* 1149 * Unsigned "min" is used here so that "order" is assigned 1150 * "VM_NFREEORDER - 1" when "m"'s physical address is zero 1151 * or the low-order bits of its physical address are zero 1152 * because the size of a physical address exceeds the size of 1153 * a long. 1154 */ 1155 return (min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, 1156 VM_NFREEORDER - 1)); 1157 } 1158 1159 /* 1160 * Free a contiguous, arbitrarily sized set of physical pages, without 1161 * merging across set boundaries. 
1162 * 1163 * The free page queues must be locked. 1164 */ 1165 void 1166 vm_phys_enqueue_contig(vm_page_t m, u_long npages) 1167 { 1168 struct vm_freelist *fl; 1169 struct vm_phys_seg *seg; 1170 vm_page_t m_end; 1171 int order; 1172 1173 /* 1174 * Avoid unnecessary coalescing by freeing the pages in the largest 1175 * possible power-of-two-sized subsets. 1176 */ 1177 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1178 seg = &vm_phys_segs[m->segind]; 1179 fl = (*seg->free_queues)[m->pool]; 1180 m_end = m + npages; 1181 /* Free blocks of increasing size. */ 1182 while ((order = max_order(m)) < VM_NFREEORDER - 1 && 1183 m + (1 << order) <= m_end) { 1184 KASSERT(seg == &vm_phys_segs[m->segind], 1185 ("%s: page range [%p,%p) spans multiple segments", 1186 __func__, m_end - npages, m)); 1187 vm_freelist_add(fl, m, order, 1); 1188 m += 1 << order; 1189 } 1190 /* Free blocks of maximum size. */ 1191 while (m + (1 << order) <= m_end) { 1192 KASSERT(seg == &vm_phys_segs[m->segind], 1193 ("%s: page range [%p,%p) spans multiple segments", 1194 __func__, m_end - npages, m)); 1195 vm_freelist_add(fl, m, order, 1); 1196 m += 1 << order; 1197 } 1198 /* Free blocks of diminishing size. */ 1199 while (m < m_end) { 1200 KASSERT(seg == &vm_phys_segs[m->segind], 1201 ("%s: page range [%p,%p) spans multiple segments", 1202 __func__, m_end - npages, m)); 1203 order = flsl(m_end - m) - 1; 1204 vm_freelist_add(fl, m, order, 1); 1205 m += 1 << order; 1206 } 1207 } 1208 1209 /* 1210 * Free a contiguous, arbitrarily sized set of physical pages. 1211 * 1212 * The free page queues must be locked. 
1213 */ 1214 void 1215 vm_phys_free_contig(vm_page_t m, u_long npages) 1216 { 1217 int order_start, order_end; 1218 vm_page_t m_start, m_end; 1219 1220 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1221 1222 m_start = m; 1223 order_start = max_order(m_start); 1224 if (order_start < VM_NFREEORDER - 1) 1225 m_start += 1 << order_start; 1226 m_end = m + npages; 1227 order_end = max_order(m_end); 1228 if (order_end < VM_NFREEORDER - 1) 1229 m_end -= 1 << order_end; 1230 /* 1231 * Avoid unnecessary coalescing by freeing the pages at the start and 1232 * end of the range last. 1233 */ 1234 if (m_start < m_end) 1235 vm_phys_enqueue_contig(m_start, m_end - m_start); 1236 if (order_start < VM_NFREEORDER - 1) 1237 vm_phys_free_pages(m, order_start); 1238 if (order_end < VM_NFREEORDER - 1) 1239 vm_phys_free_pages(m_end, order_end); 1240 } 1241 1242 /* 1243 * Scan physical memory between the specified addresses "low" and "high" for a 1244 * run of contiguous physical pages that satisfy the specified conditions, and 1245 * return the lowest page in the run. The specified "alignment" determines 1246 * the alignment of the lowest physical page in the run. If the specified 1247 * "boundary" is non-zero, then the run of physical pages cannot span a 1248 * physical address that is a multiple of "boundary". 1249 * 1250 * "npages" must be greater than zero. Both "alignment" and "boundary" must 1251 * be a power of two. 
1252 */ 1253 vm_page_t 1254 vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1255 u_long alignment, vm_paddr_t boundary, int options) 1256 { 1257 vm_paddr_t pa_end; 1258 vm_page_t m_end, m_run, m_start; 1259 struct vm_phys_seg *seg; 1260 int segind; 1261 1262 KASSERT(npages > 0, ("npages is 0")); 1263 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1264 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1265 if (low >= high) 1266 return (NULL); 1267 for (segind = 0; segind < vm_phys_nsegs; segind++) { 1268 seg = &vm_phys_segs[segind]; 1269 if (seg->domain != domain) 1270 continue; 1271 if (seg->start >= high) 1272 break; 1273 if (low >= seg->end) 1274 continue; 1275 if (low <= seg->start) 1276 m_start = seg->first_page; 1277 else 1278 m_start = &seg->first_page[atop(low - seg->start)]; 1279 if (high < seg->end) 1280 pa_end = high; 1281 else 1282 pa_end = seg->end; 1283 if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) 1284 continue; 1285 m_end = &seg->first_page[atop(pa_end - seg->start)]; 1286 m_run = vm_page_scan_contig(npages, m_start, m_end, 1287 alignment, boundary, options); 1288 if (m_run != NULL) 1289 return (m_run); 1290 } 1291 return (NULL); 1292 } 1293 1294 /* 1295 * Set the pool for a contiguous, power of two-sized set of physical pages. 1296 */ 1297 void 1298 vm_phys_set_pool(int pool, vm_page_t m, int order) 1299 { 1300 vm_page_t m_tmp; 1301 1302 for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) 1303 m_tmp->pool = pool; 1304 } 1305 1306 /* 1307 * Search for the given physical page "m" in the free lists. If the search 1308 * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return 1309 * FALSE, indicating that "m" is not in the free lists. 1310 * 1311 * The free page queues must be locked. 
1312 */ 1313 boolean_t 1314 vm_phys_unfree_page(vm_page_t m) 1315 { 1316 struct vm_freelist *fl; 1317 struct vm_phys_seg *seg; 1318 vm_paddr_t pa, pa_half; 1319 vm_page_t m_set, m_tmp; 1320 int order; 1321 1322 /* 1323 * First, find the contiguous, power of two-sized set of free 1324 * physical pages containing the given physical page "m" and 1325 * assign it to "m_set". 1326 */ 1327 seg = &vm_phys_segs[m->segind]; 1328 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1329 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && 1330 order < VM_NFREEORDER - 1; ) { 1331 order++; 1332 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); 1333 if (pa >= seg->start) 1334 m_set = &seg->first_page[atop(pa - seg->start)]; 1335 else 1336 return (FALSE); 1337 } 1338 if (m_set->order < order) 1339 return (FALSE); 1340 if (m_set->order == VM_NFREEORDER) 1341 return (FALSE); 1342 KASSERT(m_set->order < VM_NFREEORDER, 1343 ("vm_phys_unfree_page: page %p has unexpected order %d", 1344 m_set, m_set->order)); 1345 1346 /* 1347 * Next, remove "m_set" from the free lists. Finally, extract 1348 * "m" from "m_set" using an iterative algorithm: While "m_set" 1349 * is larger than a page, shrink "m_set" by returning the half 1350 * of "m_set" that does not contain "m" to the free lists. 1351 */ 1352 fl = (*seg->free_queues)[m_set->pool]; 1353 order = m_set->order; 1354 vm_freelist_rem(fl, m_set, order); 1355 while (order > 0) { 1356 order--; 1357 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); 1358 if (m->phys_addr < pa_half) 1359 m_tmp = &seg->first_page[atop(pa_half - seg->start)]; 1360 else { 1361 m_tmp = m_set; 1362 m_set = &seg->first_page[atop(pa_half - seg->start)]; 1363 } 1364 vm_freelist_add(fl, m_tmp, order, 0); 1365 } 1366 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); 1367 return (TRUE); 1368 } 1369 1370 /* 1371 * Allocate a contiguous set of physical pages of the given size 1372 * "npages" from the free lists. 
All of the physical pages must be at 1373 * or above the given physical address "low" and below the given 1374 * physical address "high". The given value "alignment" determines the 1375 * alignment of the first physical page in the set. If the given value 1376 * "boundary" is non-zero, then the set of physical pages cannot cross 1377 * any physical address boundary that is a multiple of that value. Both 1378 * "alignment" and "boundary" must be a power of two. 1379 */ 1380 vm_page_t 1381 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1382 u_long alignment, vm_paddr_t boundary) 1383 { 1384 vm_paddr_t pa_end, pa_start; 1385 vm_page_t m_run; 1386 struct vm_phys_seg *seg; 1387 int segind; 1388 1389 KASSERT(npages > 0, ("npages is 0")); 1390 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1391 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1392 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 1393 if (low >= high) 1394 return (NULL); 1395 m_run = NULL; 1396 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 1397 seg = &vm_phys_segs[segind]; 1398 if (seg->start >= high || seg->domain != domain) 1399 continue; 1400 if (low >= seg->end) 1401 break; 1402 if (low <= seg->start) 1403 pa_start = seg->start; 1404 else 1405 pa_start = low; 1406 if (high < seg->end) 1407 pa_end = high; 1408 else 1409 pa_end = seg->end; 1410 if (pa_end - pa_start < ptoa(npages)) 1411 continue; 1412 m_run = vm_phys_alloc_seg_contig(seg, npages, low, high, 1413 alignment, boundary); 1414 if (m_run != NULL) 1415 break; 1416 } 1417 return (m_run); 1418 } 1419 1420 /* 1421 * Allocate a run of contiguous physical pages from the free list for the 1422 * specified segment. 
1423 */ 1424 static vm_page_t 1425 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, 1426 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1427 { 1428 struct vm_freelist *fl; 1429 vm_paddr_t pa, pa_end, size; 1430 vm_page_t m, m_ret; 1431 u_long npages_end; 1432 int oind, order, pind; 1433 1434 KASSERT(npages > 0, ("npages is 0")); 1435 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1436 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1437 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1438 /* Compute the queue that is the best fit for npages. */ 1439 order = flsl(npages - 1); 1440 /* Search for a run satisfying the specified conditions. */ 1441 size = npages << PAGE_SHIFT; 1442 for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; 1443 oind++) { 1444 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1445 fl = (*seg->free_queues)[pind]; 1446 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { 1447 /* 1448 * Is the size of this allocation request 1449 * larger than the largest block size? 1450 */ 1451 if (order >= VM_NFREEORDER) { 1452 /* 1453 * Determine if a sufficient number of 1454 * subsequent blocks to satisfy the 1455 * allocation request are free. 1456 */ 1457 pa = VM_PAGE_TO_PHYS(m_ret); 1458 pa_end = pa + size; 1459 if (pa_end < pa) 1460 continue; 1461 for (;;) { 1462 pa += 1 << (PAGE_SHIFT + 1463 VM_NFREEORDER - 1); 1464 if (pa >= pa_end || 1465 pa < seg->start || 1466 pa >= seg->end) 1467 break; 1468 m = &seg->first_page[atop(pa - 1469 seg->start)]; 1470 if (m->order != VM_NFREEORDER - 1471 1) 1472 break; 1473 } 1474 /* If not, go to the next block. */ 1475 if (pa < pa_end) 1476 continue; 1477 } 1478 1479 /* 1480 * Determine if the blocks are within the 1481 * given range, satisfy the given alignment, 1482 * and do not cross the given boundary. 
1483 */ 1484 pa = VM_PAGE_TO_PHYS(m_ret); 1485 pa_end = pa + size; 1486 if (pa >= low && pa_end <= high && 1487 (pa & (alignment - 1)) == 0 && 1488 rounddown2(pa ^ (pa_end - 1), boundary) == 0) 1489 goto done; 1490 } 1491 } 1492 } 1493 return (NULL); 1494 done: 1495 for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { 1496 fl = (*seg->free_queues)[m->pool]; 1497 vm_freelist_rem(fl, m, oind); 1498 if (m->pool != VM_FREEPOOL_DEFAULT) 1499 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind); 1500 } 1501 /* Return excess pages to the free lists. */ 1502 npages_end = roundup2(npages, 1 << oind); 1503 if (npages < npages_end) { 1504 fl = (*seg->free_queues)[VM_FREEPOOL_DEFAULT]; 1505 vm_phys_enq_range(&m_ret[npages], npages_end - npages, fl, 0); 1506 } 1507 return (m_ret); 1508 } 1509 1510 /* 1511 * Return the index of the first unused slot which may be the terminating 1512 * entry. 1513 */ 1514 static int 1515 vm_phys_avail_count(void) 1516 { 1517 int i; 1518 1519 for (i = 0; phys_avail[i + 1]; i += 2) 1520 continue; 1521 if (i > PHYS_AVAIL_ENTRIES) 1522 panic("Improperly terminated phys_avail %d entries", i); 1523 1524 return (i); 1525 } 1526 1527 /* 1528 * Assert that a phys_avail entry is valid. 1529 */ 1530 static void 1531 vm_phys_avail_check(int i) 1532 { 1533 if (phys_avail[i] & PAGE_MASK) 1534 panic("Unaligned phys_avail[%d]: %#jx", i, 1535 (intmax_t)phys_avail[i]); 1536 if (phys_avail[i+1] & PAGE_MASK) 1537 panic("Unaligned phys_avail[%d + 1]: %#jx", i, 1538 (intmax_t)phys_avail[i]); 1539 if (phys_avail[i + 1] < phys_avail[i]) 1540 panic("phys_avail[%d] start %#jx < end %#jx", i, 1541 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]); 1542 } 1543 1544 /* 1545 * Return the index of an overlapping phys_avail entry or -1. 
1546 */ 1547 #ifdef NUMA 1548 static int 1549 vm_phys_avail_find(vm_paddr_t pa) 1550 { 1551 int i; 1552 1553 for (i = 0; phys_avail[i + 1]; i += 2) 1554 if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) 1555 return (i); 1556 return (-1); 1557 } 1558 #endif 1559 1560 /* 1561 * Return the index of the largest entry. 1562 */ 1563 int 1564 vm_phys_avail_largest(void) 1565 { 1566 vm_paddr_t sz, largesz; 1567 int largest; 1568 int i; 1569 1570 largest = 0; 1571 largesz = 0; 1572 for (i = 0; phys_avail[i + 1]; i += 2) { 1573 sz = vm_phys_avail_size(i); 1574 if (sz > largesz) { 1575 largesz = sz; 1576 largest = i; 1577 } 1578 } 1579 1580 return (largest); 1581 } 1582 1583 vm_paddr_t 1584 vm_phys_avail_size(int i) 1585 { 1586 1587 return (phys_avail[i + 1] - phys_avail[i]); 1588 } 1589 1590 /* 1591 * Split an entry at the address 'pa'. Return zero on success or errno. 1592 */ 1593 static int 1594 vm_phys_avail_split(vm_paddr_t pa, int i) 1595 { 1596 int cnt; 1597 1598 vm_phys_avail_check(i); 1599 if (pa <= phys_avail[i] || pa >= phys_avail[i + 1]) 1600 panic("vm_phys_avail_split: invalid address"); 1601 cnt = vm_phys_avail_count(); 1602 if (cnt >= PHYS_AVAIL_ENTRIES) 1603 return (ENOSPC); 1604 memmove(&phys_avail[i + 2], &phys_avail[i], 1605 (cnt - i) * sizeof(phys_avail[0])); 1606 phys_avail[i + 1] = pa; 1607 phys_avail[i + 2] = pa; 1608 vm_phys_avail_check(i); 1609 vm_phys_avail_check(i+2); 1610 1611 return (0); 1612 } 1613 1614 /* 1615 * This routine allocates NUMA node specific memory before the page 1616 * allocator is bootstrapped. 1617 */ 1618 vm_paddr_t 1619 vm_phys_early_alloc(int domain, size_t alloc_size) 1620 { 1621 int i, mem_index, biggestone; 1622 vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align; 1623 1624 1625 /* 1626 * Search the mem_affinity array for the biggest address 1627 * range in the desired domain. This is used to constrain 1628 * the phys_avail selection below. 
1629 */ 1630 biggestsize = 0; 1631 mem_index = 0; 1632 mem_start = 0; 1633 mem_end = -1; 1634 #ifdef NUMA 1635 if (mem_affinity != NULL) { 1636 for (i = 0; ; i++) { 1637 size = mem_affinity[i].end - mem_affinity[i].start; 1638 if (size == 0) 1639 break; 1640 if (mem_affinity[i].domain != domain) 1641 continue; 1642 if (size > biggestsize) { 1643 mem_index = i; 1644 biggestsize = size; 1645 } 1646 } 1647 mem_start = mem_affinity[mem_index].start; 1648 mem_end = mem_affinity[mem_index].end; 1649 } 1650 #endif 1651 1652 /* 1653 * Now find biggest physical segment in within the desired 1654 * numa domain. 1655 */ 1656 biggestsize = 0; 1657 biggestone = 0; 1658 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1659 /* skip regions that are out of range */ 1660 if (phys_avail[i+1] - alloc_size < mem_start || 1661 phys_avail[i+1] > mem_end) 1662 continue; 1663 size = vm_phys_avail_size(i); 1664 if (size > biggestsize) { 1665 biggestone = i; 1666 biggestsize = size; 1667 } 1668 } 1669 alloc_size = round_page(alloc_size); 1670 1671 /* 1672 * Grab single pages from the front to reduce fragmentation. 1673 */ 1674 if (alloc_size == PAGE_SIZE) { 1675 pa = phys_avail[biggestone]; 1676 phys_avail[biggestone] += PAGE_SIZE; 1677 vm_phys_avail_check(biggestone); 1678 return (pa); 1679 } 1680 1681 /* 1682 * Naturally align large allocations. 1683 */ 1684 align = phys_avail[biggestone + 1] & (alloc_size - 1); 1685 if (alloc_size + align > biggestsize) 1686 panic("cannot find a large enough size\n"); 1687 if (align != 0 && 1688 vm_phys_avail_split(phys_avail[biggestone + 1] - align, 1689 biggestone) != 0) 1690 /* Wasting memory. 
*/ 1691 phys_avail[biggestone + 1] -= align; 1692 1693 phys_avail[biggestone + 1] -= alloc_size; 1694 vm_phys_avail_check(biggestone); 1695 pa = phys_avail[biggestone + 1]; 1696 return (pa); 1697 } 1698 1699 void 1700 vm_phys_early_startup(void) 1701 { 1702 int i; 1703 1704 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1705 phys_avail[i] = round_page(phys_avail[i]); 1706 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 1707 } 1708 1709 #ifdef NUMA 1710 /* Force phys_avail to be split by domain. */ 1711 if (mem_affinity != NULL) { 1712 int idx; 1713 1714 for (i = 0; mem_affinity[i].end != 0; i++) { 1715 idx = vm_phys_avail_find(mem_affinity[i].start); 1716 if (idx != -1 && 1717 phys_avail[idx] != mem_affinity[i].start) 1718 vm_phys_avail_split(mem_affinity[i].start, idx); 1719 idx = vm_phys_avail_find(mem_affinity[i].end); 1720 if (idx != -1 && 1721 phys_avail[idx] != mem_affinity[i].end) 1722 vm_phys_avail_split(mem_affinity[i].end, idx); 1723 } 1724 } 1725 #endif 1726 } 1727 1728 #ifdef DDB 1729 /* 1730 * Show the number of physical pages in each of the free lists. 
1731 */ 1732 DB_SHOW_COMMAND(freepages, db_show_freepages) 1733 { 1734 struct vm_freelist *fl; 1735 int flind, oind, pind, dom; 1736 1737 for (dom = 0; dom < vm_ndomains; dom++) { 1738 db_printf("DOMAIN: %d\n", dom); 1739 for (flind = 0; flind < vm_nfreelists; flind++) { 1740 db_printf("FREE LIST %d:\n" 1741 "\n ORDER (SIZE) | NUMBER" 1742 "\n ", flind); 1743 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1744 db_printf(" | POOL %d", pind); 1745 db_printf("\n-- "); 1746 for (pind = 0; pind < VM_NFREEPOOL; pind++) 1747 db_printf("-- -- "); 1748 db_printf("--\n"); 1749 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { 1750 db_printf(" %2.2d (%6.6dK)", oind, 1751 1 << (PAGE_SHIFT - 10 + oind)); 1752 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1753 fl = vm_phys_free_queues[dom][flind][pind]; 1754 db_printf(" | %6.6d", fl[oind].lcnt); 1755 } 1756 db_printf("\n"); 1757 } 1758 db_printf("\n"); 1759 } 1760 db_printf("\n"); 1761 } 1762 } 1763 #endif 1764