/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

int
vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match: Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

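/*
 * Illustrative example (added comment): assuming a hypothetical two-domain
 * layout in which mem_affinity describes domain 0 as [0, 4G) and domain 1
 * as [4G, 8G), a call such as vm_phys_domain_match(0, 5G, 6G) finds that
 * only domain 1 overlaps [low, high], so the preferred domain 0 is ignored
 * and 1 is returned.
 */
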
/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f, int t)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

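/*
 * Illustrative note (added comment): mem_locality is a flat
 * vm_ndomains x vm_ndomains matrix, so vm_phys_mem_affinity(f, t) is just
 * row f, column t.  For example, with two domains and a hypothetical
 * ACPI-SLIT-style table {10, 21, 21, 10}, local accesses report a distance
 * of 10 and remote accesses report 21.
 */
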
#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef	VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef	VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

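/*
 * Illustrative example (added comment): on a configuration that defines
 * both boundaries, e.g. a hypothetical 16M VM_LOWMEM_BOUNDARY together
 * with the 4G VM_DMA32_BOUNDARY, a single call vm_phys_add_seg(8M, 5G)
 * would be recorded as three segments, [8M, 16M), [16M, 4G), and [4G, 5G),
 * so that each segment falls entirely within one free list.
 */
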
/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef	VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef	VM_FREELIST_DMA32
		if (
#ifdef	VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef	VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef	VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
    int *locality)
{
#ifdef NUMA
	int d, i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	d = 0;
	TUNABLE_INT_FETCH("vm.numa.disabled", &d);
	if (d)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#else
	(void)ndomains;
	(void)affinity;
	(void)locality;
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective is to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int tail)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, tail);
	}
}

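/*
 * Illustrative example (added comment): if an order-3 block (8 pages) is
 * taken from the free lists to satisfy an order-1 (2 page) request, then
 * vm_phys_split_pages(m, 3, fl, 1, tail) releases the upper halves as it
 * shrinks the block: the order-2 buddy at &m[4] and the order-1 buddy at
 * &m[2] go back onto "fl", leaving [m, m + 2) for the caller.
 */
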
/*
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
 * aligned and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    (VM_PAGE_TO_PHYS(m) &
	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
	    ("%s: page %p and npages %u are misaligned",
	    __func__, m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("%s: page %p has unexpected order %d",
		    __func__, m, m->order));
		order = fls(npages) - 1;
		KASSERT(order < VM_NFREEORDER,
		    ("%s: order %d is out of range", __func__, order));
		vm_freelist_add(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * If npages is zero, this function does nothing and ignores the physical page
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
 */
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		KASSERT(order < VM_NFREEORDER,
		    ("vm_phys_enq_range: order %d is out of range", order));
		vm_freelist_add(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
	return (m);
}

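/*
 * Illustrative example (added comment): both helpers above carve
 * [m, m + npages) into power-of-two blocks; they differ only in which end
 * of the set stays aligned.  For npages = 13, vm_phys_enq_beg() peels
 * blocks off the front using fls(): 8, then 4, then 1 page, while
 * vm_phys_enq_range(), whose range must end on the aligned boundary,
 * uses ffs() to free 1, then 4, then 8 pages and returns the first page
 * past the range.
 */
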
/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = i + (1 << oind);
				end = imin(npages, avail);
				while (i < end)
					ma[i++] = m++;
				if (i == npages) {
					/*
					 * Return excess pages to fl.  Its order
					 * [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - i, fl, 1);
					return (npages);
				}
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_set_pool(pool, m, oind);
					avail = i + (1 << oind);
					end = imin(npages, avail);
					while (i < end)
						ma[i++] = m++;
					if (i == npages) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail - i,
						    fl, 1);
						return (npages);
					}
				}
			}
		}
	}
	return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_set_pool(pool, m, oind);
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, 1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return (&seg->first_page[atop(pa - seg->start)]);
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}

/*
 * Return the largest possible order of a set of pages starting at m.
 */
static int
max_order(vm_page_t m)
{

	/*
	 * Unsigned "min" is used here so that "order" is assigned
	 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
	 * or the low-order bits of its physical address are zero
	 * because the size of a physical address exceeds the size of
	 * a long.
	 */
	return (min(ffsll(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
	    VM_NFREEORDER - 1));
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	vm_paddr_t diff, lo;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[m->pool];
	m_end = m + npages;
	/* Free blocks of increasing size. */
	lo = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
	if (m < m_end &&
	    (diff = lo ^ (lo + npages - 1)) != 0) {
		order = min(flsll(diff) - 1, VM_NFREEORDER - 1);
		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1);
	}

	/* Free blocks of maximum size. */
	order = VM_NFREEORDER - 1;
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	vm_phys_enq_beg(m, m_end - m, fl, 1);
}

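/*
 * Illustrative example (added comment): enqueueing 11 pages starting at
 * page frame 5 covers frames [5, 16).  The misaligned leading fragment
 * [5, 8) is freed through vm_phys_enq_range() as a 1-page and then a
 * 2-page block, no maximum-size block fits in the 8 pages that remain
 * (assuming VM_NFREEORDER - 1 > 3), and vm_phys_enq_beg() finishes with
 * a single aligned 8-page block at frame 8.  No block is coalesced across
 * the boundaries of the set.
 */
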
/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	int order_start, order_end;
	vm_page_t m_start, m_end;

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	m_start = m;
	order_start = max_order(m_start);
	if (order_start < VM_NFREEORDER - 1)
		m_start += 1 << order_start;
	m_end = m + npages;
	order_end = max_order(m_end);
	if (order_end < VM_NFREEORDER - 1)
		m_end -= 1 << order_end;
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, m_end - m_start);
	if (order_start < VM_NFREEORDER - 1)
		vm_phys_free_pages(m, order_start);
	if (order_end < VM_NFREEORDER - 1)
		vm_phys_free_pages(m_end, order_end);
}

/*
 * Identify the first address range within segment segind or greater
 * that matches the domain, lies within the low/high range, and has
 * enough pages.  Return -1 if there is none.
 */
int
vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
    u_long npages, vm_paddr_t low, vm_paddr_t high)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_phys_seg *end_seg, *seg;

	KASSERT(npages > 0, ("npages is zero"));
	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
		if (seg->domain != domain)
			continue;
		if (seg->start >= high)
			return (-1);
		pa_start = MAX(low, seg->start);
		pa_end = MIN(high, seg->end);
		if (pa_end - pa_start < ptoa(npages))
			continue;
		bounds[0] = &seg->first_page[atop(pa_start - seg->start)];
		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
		return (seg - vm_phys_segs);
	}
	return (-1);
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return true.  Otherwise, return
 * false, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
bool
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (false);
	}
	if (m_set->order < order)
		return (false);
	if (m_set->order == VM_NFREEORDER)
		return (false);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm:  While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (true);
}

/*
 * Find a run of contiguous physical pages from the specified page list.
 */
static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist *fl, int oind, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_phys_seg *seg;
	vm_paddr_t frag, lbound, pa, page_size, pa_end, pa_pre, size;
	vm_page_t m, m_listed, m_ret;
	int order;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Search for a run satisfying the specified conditions. */
	page_size = PAGE_SIZE;
	size = npages << PAGE_SHIFT;
	frag = (npages & ~(~0UL << oind)) << PAGE_SHIFT;
	TAILQ_FOREACH(m_listed, &fl[oind].pl, listq) {
		/*
		 * Determine if the address range starting at pa is
		 * too low.
		 */
		pa = VM_PAGE_TO_PHYS(m_listed);
		if (pa < low)
			continue;

		/*
		 * If this is not the first free oind-block in this range, bail
		 * out.  We have seen the first free block already, or will see
		 * it before failing to find an appropriate range.
		 */
		seg = &vm_phys_segs[m_listed->segind];
		lbound = low > seg->start ? low : seg->start;
		pa_pre = pa - (page_size << oind);
		m = &seg->first_page[atop(pa_pre - seg->start)];
		if (pa != 0 && pa_pre >= lbound && m->order == oind)
			continue;

		if (!vm_addr_align_ok(pa, alignment))
			/* Advance to satisfy alignment condition. */
			pa = roundup2(pa, alignment);
		else if (frag != 0 && lbound + frag <= pa) {
			/*
			 * Back up to the first aligned free block in this
			 * range, without moving below lbound.
			 */
			pa_end = pa;
			for (order = oind - 1; order >= 0; order--) {
				pa_pre = pa_end - (page_size << order);
				if (!vm_addr_align_ok(pa_pre, alignment))
					break;
				m = &seg->first_page[atop(pa_pre - seg->start)];
				if (pa_pre >= lbound && m->order == order)
					pa_end = pa_pre;
			}
			/*
			 * If the extra small blocks are enough to complete the
			 * fragment, use them.  Otherwise, look to allocate the
			 * fragment at the other end.
			 */
			if (pa_end + frag <= pa)
				pa = pa_end;
		}

		/* Advance as necessary to satisfy boundary conditions. */
		if (!vm_addr_bound_ok(pa, size, boundary))
			pa = roundup2(pa + 1, boundary);
		pa_end = pa + size;

		/*
		 * Determine if the address range is valid (without overflow in
		 * pa_end calculation), and fits within the segment.
		 */
		if (pa_end < pa || seg->end < pa_end)
			continue;

		m_ret = &seg->first_page[atop(pa - seg->start)];

		/*
		 * Determine whether there are enough free oind-blocks here to
		 * satisfy the allocation request.
		 */
		pa = VM_PAGE_TO_PHYS(m_listed);
		do {
			pa += page_size << oind;
			if (pa >= pa_end)
				return (m_ret);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (oind == m->order);

		/*
		 * Determine if an additional series of free blocks of
		 * diminishing size can help to satisfy the allocation request.
		 */
		while (m->order < oind &&
		    pa + 2 * (page_size << m->order) > pa_end) {
			pa += page_size << m->order;
			if (pa >= pa_end)
				return (m_ret);
			m = &seg->first_page[atop(pa - seg->start)];
		}
	}
	return (NULL);
}

/*
 * Find a run of contiguous physical pages from the specified free list
 * table.
 */
static vm_page_t
vm_phys_find_queues_contig(
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
    u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_page_t m_ret;
	vm_paddr_t pa, pa_end, size;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Compute the queue that is the best fit for npages. */
	order = flsl(npages - 1);
	/* Search for a large enough free block. */
	size = npages << PAGE_SHIFT;
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = (*queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
				/*
				 * Determine if the address range starting at pa
				 * is within the given range, satisfies the
				 * given alignment, and does not cross the given
				 * boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (low <= pa && pa_end <= high &&
				    vm_addr_ok(pa, size, alignment, boundary))
					return (m_ret);
			}
		}
	}
	if (order < VM_NFREEORDER)
		return (NULL);
	/* Search for a long-enough sequence of small blocks. */
	oind = VM_NFREEORDER - 1;
	for (pind = 0; pind < VM_NFREEPOOL; pind++) {
		fl = (*queues)[pind];
		m_ret = vm_phys_find_freelist_contig(fl, oind, npages,
		    low, high, alignment, boundary);
		if (m_ret != NULL)
			return (m_ret);
	}
	return (NULL);
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_freelist *fl;
	vm_page_t m, m_run;
	struct vm_phys_seg *seg;
	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
	int oind, segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);
	queues = NULL;
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		/*
		 * If a previous segment led to a search using
		 * the same free lists as would this segment, then
		 * we've actually already searched within this
		 * too.  So skip it.
		 */
		if (seg->free_queues == queues)
			continue;
		queues = seg->free_queues;
		m_run = vm_phys_find_queues_contig(queues, npages,
		    low, high, alignment, boundary);
		if (m_run != NULL)
			break;
	}
	if (m_run == NULL)
		return (NULL);

	/* Allocate pages from the page-range found. */
	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
		fl = (*queues)[m->pool];
		oind = m->order;
		vm_freelist_rem(fl, m, oind);
		if (m->pool != VM_FREEPOOL_DEFAULT)
			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
	}
	/* Return excess pages to the free lists. */
	fl = (*queues)[VM_FREEPOOL_DEFAULT];
	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0);
	return (m_run);
}

/*
 * Return the index of the first unused slot which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		continue;
	if (i > PHYS_AVAIL_ENTRIES)
		panic("Improperly terminated phys_avail %d entries", i);

	return (i);
}

/*
 * Assert that a phys_avail entry is valid.
 */
static void
vm_phys_avail_check(int i)
{
	if (phys_avail[i] & PAGE_MASK)
		panic("Unaligned phys_avail[%d]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] & PAGE_MASK)
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
		    (intmax_t)phys_avail[i + 1]);
	if (phys_avail[i + 1] < phys_avail[i])
		panic("phys_avail[%d] start %#jx < end %#jx", i,
		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
			return (i);
	return (-1);
}
#endif

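/*
 * Illustrative example (added comment): with a hypothetical map
 *
 *	phys_avail[] = { 0x1000, 0x9f000, 0x100000, 0x7fe00000, 0, 0 };
 *
 * vm_phys_avail_count() returns 4 (the index of the first unused slot),
 * vm_phys_avail_size(2) is 0x7fd00000, and vm_phys_avail_largest()
 * returns 2, the index of the larger of the two entries.
 */
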
/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}

	return (largest);
}

vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split an entry at the address 'pa'.  Return zero on success or errno.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
		panic("vm_phys_avail_split: invalid address");
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i + 2);

	return (0);
}

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest physical segment within the desired
	 * NUMA domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		if (phys_avail[i + 1] - alloc_size < mem_start ||
		    phys_avail[i + 1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif