1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2002-2006 Rice University 5 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu> 6 * All rights reserved. 7 * 8 * This software was developed for the FreeBSD Project by Alan L. Cox, 9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Physical memory system implementation 36 * 37 * Any external functions defined by this module are only to be used by the 38 * virtual memory system. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

/* A "long" bitmask must be wide enough to cover every physical segment. */
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
/* ffsll()/flsll() are applied to physical addresses below. */
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
/* NUMA affinity/locality tables, registered by platform-dependent code. */
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;
#endif

/* Number of physical memory domains and the set containing all of them. */
int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

/* Table of physical memory segments, kept sorted by start address. */
struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
/* Segments recorded before the VM system is fully initialized. */
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

/* Red-black tree of registered fictitious page ranges. */
RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

/* Protects vm_phys_fictitious_tree. */
static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

/* The buddy free queues: per domain, per free list, per pool, per order. */
static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

/* Number of free lists actually in use; computed by vm_phys_init(). */
static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

/* vm.phys_free: human-readable dump of the buddy free queues. */
static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

/* vm.phys_segs: human-readable dump of the physical segment table. */
static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
/* vm.phys_locality: domain-to-domain locality matrix. */
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

/*
 * Red-black tree helpers for vm fictitious range management.
 */

/*
 * Order a single-page lookup key "p" (end == 0) against a registered range.
 * Returns <0, 0 or >0 as the page lies below, inside or above "range".
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

/*
 * RB-tree comparator for fictitious segments.  A key with end == 0 is a
 * single-page search; otherwise both arguments are full ranges, and
 * overlapping ranges are a fatal error (ranges must be disjoint).
 */
static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

/*
 * Return a domain owning memory that overlaps [low, high], preferring
 * "prefer" when it qualifies and prefer != -1.  Falls back to domain 0
 * when NUMA information is unavailable.
 */
int
vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match: Impossible constraint");
	/* Otherwise pick the lowest-numbered overlapping domain. */
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	/* One table per domain and free list: rows are orders, columns pools. */
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f, int t)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	/* mem_locality is a flattened vm_ndomains x vm_ndomains matrix. */
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
346 */ 347 static int 348 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) 349 { 350 struct sbuf sbuf; 351 int error, i, j; 352 353 error = sysctl_wire_old_buffer(req, 0); 354 if (error != 0) 355 return (error); 356 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 357 358 sbuf_printf(&sbuf, "\n"); 359 360 for (i = 0; i < vm_ndomains; i++) { 361 sbuf_printf(&sbuf, "%d: ", i); 362 for (j = 0; j < vm_ndomains; j++) { 363 sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); 364 } 365 sbuf_printf(&sbuf, "\n"); 366 } 367 error = sbuf_finish(&sbuf); 368 sbuf_delete(&sbuf); 369 return (error); 370 } 371 #endif 372 373 static void 374 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) 375 { 376 377 m->order = order; 378 if (tail) 379 TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); 380 else 381 TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); 382 fl[order].lcnt++; 383 } 384 385 static void 386 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) 387 { 388 389 TAILQ_REMOVE(&fl[order].pl, m, listq); 390 fl[order].lcnt--; 391 m->order = VM_NFREEORDER; 392 } 393 394 /* 395 * Create a physical memory segment. 
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	/* Insertion sort: keep vm_phys_segs[] ordered by start address. */
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

/*
 * Create one or more segments covering [start, end), splitting at NUMA
 * domain boundaries when affinity information is registered.
 */
static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			/* The remainder lies entirely in this domain. */
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		/* Split at the domain boundary and continue. */
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	/* Iterate highest to lowest so "npages" counts memory above 4G. */
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		/* Sparse layout: pages are packed consecutively per segment. */
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			/* Shift the remaining segments down over "seg". */
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
    int *locality)
{
#ifdef NUMA
	int d, i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	d = 0;
	TUNABLE_INT_FETCH("vm.numa.disabled", &d);
	if (d)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#else
	(void)ndomains;
	(void)affinity;
	(void)locality;
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective being to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int tail)
{
	vm_page_t m_buddy;

	/* Halve the block repeatedly, freeing the upper buddy each time. */
	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, tail);
	}
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	int order;

	KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << fls(npages / 2)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	/*
	 * Peel off blocks in increasing powers of two: each block's size is
	 * the lowest set bit of the remaining count.
	 */
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		KASSERT(order < VM_NFREEORDER,
		    ("vm_phys_enq_range: order %d is out of range", order));
		vm_freelist_add(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
	/* Returns the first page following the freed range. */
	return (m);
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		/* First pass: draw from the requested pool, smallest order up. */
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = i + (1 << oind);
				end = imin(npages, avail);
				while (i < end)
					ma[i++] = m++;
				if (i == npages) {
					/*
					 * Return excess pages to fl.  Its order
					 * [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - i, fl, 1);
					return (npages);
				}
			}
		}
		/* Second pass: steal from other pools, largest order down. */
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_set_pool(pool, m, oind);
					avail = i + (1 << oind);
					end = imin(npages, avail);
					while (i < end)
						ma[i++] = m++;
					if (i == npages) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail - i,
						    fl, 1);
						return (npages);
					}
				}
			}
		}
	}
	/* Partial success: fewer than npages pages were available. */
	return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
810 * 811 * The free page queues must be locked. 812 */ 813 vm_page_t 814 vm_phys_alloc_pages(int domain, int pool, int order) 815 { 816 vm_page_t m; 817 int freelist; 818 819 for (freelist = 0; freelist < VM_NFREELIST; freelist++) { 820 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); 821 if (m != NULL) 822 return (m); 823 } 824 return (NULL); 825 } 826 827 /* 828 * Allocate a contiguous, power of two-sized set of physical pages from the 829 * specified free list. The free list must be specified using one of the 830 * manifest constants VM_FREELIST_*. 831 * 832 * The free page queues must be locked. 833 */ 834 vm_page_t 835 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) 836 { 837 struct vm_freelist *alt, *fl; 838 vm_page_t m; 839 int oind, pind, flind; 840 841 KASSERT(domain >= 0 && domain < vm_ndomains, 842 ("vm_phys_alloc_freelist_pages: domain %d is out of range", 843 domain)); 844 KASSERT(freelist < VM_NFREELIST, 845 ("vm_phys_alloc_freelist_pages: freelist %d is out of range", 846 freelist)); 847 KASSERT(pool < VM_NFREEPOOL, 848 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); 849 KASSERT(order < VM_NFREEORDER, 850 ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); 851 852 flind = vm_freelist_to_flind[freelist]; 853 /* Check if freelist is present */ 854 if (flind < 0) 855 return (NULL); 856 857 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 858 fl = &vm_phys_free_queues[domain][flind][pool][0]; 859 for (oind = order; oind < VM_NFREEORDER; oind++) { 860 m = TAILQ_FIRST(&fl[oind].pl); 861 if (m != NULL) { 862 vm_freelist_rem(fl, m, oind); 863 /* The order [order, oind) queues are empty. */ 864 vm_phys_split_pages(m, oind, fl, order, 1); 865 return (m); 866 } 867 } 868 869 /* 870 * The given pool was empty. Find the largest 871 * contiguous, power-of-two-sized set of pages in any 872 * pool. Transfer these pages to the given pool, and 873 * use them to satisfy the allocation. 
874 */ 875 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { 876 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 877 alt = &vm_phys_free_queues[domain][flind][pind][0]; 878 m = TAILQ_FIRST(&alt[oind].pl); 879 if (m != NULL) { 880 vm_freelist_rem(alt, m, oind); 881 vm_phys_set_pool(pool, m, oind); 882 /* The order [order, oind) queues are empty. */ 883 vm_phys_split_pages(m, oind, fl, order, 1); 884 return (m); 885 } 886 } 887 } 888 return (NULL); 889 } 890 891 /* 892 * Find the vm_page corresponding to the given physical address. 893 */ 894 vm_page_t 895 vm_phys_paddr_to_vm_page(vm_paddr_t pa) 896 { 897 struct vm_phys_seg *seg; 898 899 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) 900 return (&seg->first_page[atop(pa - seg->start)]); 901 return (NULL); 902 } 903 904 vm_page_t 905 vm_phys_fictitious_to_vm_page(vm_paddr_t pa) 906 { 907 struct vm_phys_fictitious_seg tmp, *seg; 908 vm_page_t m; 909 910 m = NULL; 911 tmp.start = pa; 912 tmp.end = 0; 913 914 rw_rlock(&vm_phys_fictitious_reg_lock); 915 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 916 rw_runlock(&vm_phys_fictitious_reg_lock); 917 if (seg == NULL) 918 return (NULL); 919 920 m = &seg->first_page[atop(pa - seg->start)]; 921 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); 922 923 return (m); 924 } 925 926 static inline void 927 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, 928 long page_count, vm_memattr_t memattr) 929 { 930 long i; 931 932 bzero(range, page_count * sizeof(*range)); 933 for (i = 0; i < page_count; i++) { 934 vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); 935 range[i].oflags &= ~VPO_UNMANAGED; 936 range[i].busy_lock = VPB_UNBUSIED; 937 } 938 } 939 940 int 941 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, 942 vm_memattr_t memattr) 943 { 944 struct vm_phys_fictitious_seg *seg; 945 vm_page_t fp; 946 long page_count; 947 #ifdef VM_PHYSSEG_DENSE 948 long pi, pe; 949 long dpage_count; 950 #endif 951 952 
KASSERT(start < end, 953 ("Start of segment isn't less than end (start: %jx end: %jx)", 954 (uintmax_t)start, (uintmax_t)end)); 955 956 page_count = (end - start) / PAGE_SIZE; 957 958 #ifdef VM_PHYSSEG_DENSE 959 pi = atop(start); 960 pe = atop(end); 961 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 962 fp = &vm_page_array[pi - first_page]; 963 if ((pe - first_page) > vm_page_array_size) { 964 /* 965 * We have a segment that starts inside 966 * of vm_page_array, but ends outside of it. 967 * 968 * Use vm_page_array pages for those that are 969 * inside of the vm_page_array range, and 970 * allocate the remaining ones. 971 */ 972 dpage_count = vm_page_array_size - (pi - first_page); 973 vm_phys_fictitious_init_range(fp, start, dpage_count, 974 memattr); 975 page_count -= dpage_count; 976 start += ptoa(dpage_count); 977 goto alloc; 978 } 979 /* 980 * We can allocate the full range from vm_page_array, 981 * so there's no need to register the range in the tree. 982 */ 983 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 984 return (0); 985 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 986 /* 987 * We have a segment that ends inside of vm_page_array, 988 * but starts outside of it. 989 */ 990 fp = &vm_page_array[0]; 991 dpage_count = pe - first_page; 992 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, 993 memattr); 994 end -= ptoa(dpage_count); 995 page_count -= dpage_count; 996 goto alloc; 997 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 998 /* 999 * Trying to register a fictitious range that expands before 1000 * and after vm_page_array. 
1001 */ 1002 return (EINVAL); 1003 } else { 1004 alloc: 1005 #endif 1006 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, 1007 M_WAITOK); 1008 #ifdef VM_PHYSSEG_DENSE 1009 } 1010 #endif 1011 vm_phys_fictitious_init_range(fp, start, page_count, memattr); 1012 1013 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); 1014 seg->start = start; 1015 seg->end = end; 1016 seg->first_page = fp; 1017 1018 rw_wlock(&vm_phys_fictitious_reg_lock); 1019 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); 1020 rw_wunlock(&vm_phys_fictitious_reg_lock); 1021 1022 return (0); 1023 } 1024 1025 void 1026 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) 1027 { 1028 struct vm_phys_fictitious_seg *seg, tmp; 1029 #ifdef VM_PHYSSEG_DENSE 1030 long pi, pe; 1031 #endif 1032 1033 KASSERT(start < end, 1034 ("Start of segment isn't less than end (start: %jx end: %jx)", 1035 (uintmax_t)start, (uintmax_t)end)); 1036 1037 #ifdef VM_PHYSSEG_DENSE 1038 pi = atop(start); 1039 pe = atop(end); 1040 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1041 if ((pe - first_page) <= vm_page_array_size) { 1042 /* 1043 * This segment was allocated using vm_page_array 1044 * only, there's nothing to do since those pages 1045 * were never added to the tree. 1046 */ 1047 return; 1048 } 1049 /* 1050 * We have a segment that starts inside 1051 * of vm_page_array, but ends outside of it. 1052 * 1053 * Calculate how many pages were added to the 1054 * tree and free them. 1055 */ 1056 start = ptoa(first_page + vm_page_array_size); 1057 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { 1058 /* 1059 * We have a segment that ends inside of vm_page_array, 1060 * but starts outside of it. 1061 */ 1062 end = ptoa(first_page); 1063 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { 1064 /* Since it's not possible to register such a range, panic. 
*/ 1065 panic( 1066 "Unregistering not registered fictitious range [%#jx:%#jx]", 1067 (uintmax_t)start, (uintmax_t)end); 1068 } 1069 #endif 1070 tmp.start = start; 1071 tmp.end = 0; 1072 1073 rw_wlock(&vm_phys_fictitious_reg_lock); 1074 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); 1075 if (seg->start != start || seg->end != end) { 1076 rw_wunlock(&vm_phys_fictitious_reg_lock); 1077 panic( 1078 "Unregistering not registered fictitious range [%#jx:%#jx]", 1079 (uintmax_t)start, (uintmax_t)end); 1080 } 1081 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); 1082 rw_wunlock(&vm_phys_fictitious_reg_lock); 1083 free(seg->first_page, M_FICT_PAGES); 1084 free(seg, M_FICT_PAGES); 1085 } 1086 1087 /* 1088 * Free a contiguous, power of two-sized set of physical pages. 1089 * 1090 * The free page queues must be locked. 1091 */ 1092 void 1093 vm_phys_free_pages(vm_page_t m, int order) 1094 { 1095 struct vm_freelist *fl; 1096 struct vm_phys_seg *seg; 1097 vm_paddr_t pa; 1098 vm_page_t m_buddy; 1099 1100 KASSERT(m->order == VM_NFREEORDER, 1101 ("vm_phys_free_pages: page %p has unexpected order %d", 1102 m, m->order)); 1103 KASSERT(m->pool < VM_NFREEPOOL, 1104 ("vm_phys_free_pages: page %p has unexpected pool %d", 1105 m, m->pool)); 1106 KASSERT(order < VM_NFREEORDER, 1107 ("vm_phys_free_pages: order %d is out of range", order)); 1108 seg = &vm_phys_segs[m->segind]; 1109 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1110 if (order < VM_NFREEORDER - 1) { 1111 pa = VM_PAGE_TO_PHYS(m); 1112 do { 1113 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); 1114 if (pa < seg->start || pa >= seg->end) 1115 break; 1116 m_buddy = &seg->first_page[atop(pa - seg->start)]; 1117 if (m_buddy->order != order) 1118 break; 1119 fl = (*seg->free_queues)[m_buddy->pool]; 1120 vm_freelist_rem(fl, m_buddy, order); 1121 if (m_buddy->pool != m->pool) 1122 vm_phys_set_pool(m->pool, m_buddy, order); 1123 order++; 1124 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); 1125 m = 
&seg->first_page[atop(pa - seg->start)]; 1126 } while (order < VM_NFREEORDER - 1); 1127 } 1128 fl = (*seg->free_queues)[m->pool]; 1129 vm_freelist_add(fl, m, order, 1); 1130 } 1131 1132 /* 1133 * Return the largest possible order of a set of pages starting at m. 1134 */ 1135 static int 1136 max_order(vm_page_t m) 1137 { 1138 1139 /* 1140 * Unsigned "min" is used here so that "order" is assigned 1141 * "VM_NFREEORDER - 1" when "m"'s physical address is zero 1142 * or the low-order bits of its physical address are zero 1143 * because the size of a physical address exceeds the size of 1144 * a long. 1145 */ 1146 return (min(ffsll(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, 1147 VM_NFREEORDER - 1)); 1148 } 1149 1150 /* 1151 * Free a contiguous, arbitrarily sized set of physical pages, without 1152 * merging across set boundaries. 1153 * 1154 * The free page queues must be locked. 1155 */ 1156 void 1157 vm_phys_enqueue_contig(vm_page_t m, u_long npages) 1158 { 1159 struct vm_freelist *fl; 1160 struct vm_phys_seg *seg; 1161 vm_page_t m_end; 1162 vm_paddr_t diff, lo; 1163 int order; 1164 1165 /* 1166 * Avoid unnecessary coalescing by freeing the pages in the largest 1167 * possible power-of-two-sized subsets. 1168 */ 1169 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1170 seg = &vm_phys_segs[m->segind]; 1171 fl = (*seg->free_queues)[m->pool]; 1172 m_end = m + npages; 1173 /* Free blocks of increasing size. */ 1174 lo = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT; 1175 if (m < m_end && 1176 (diff = lo ^ (lo + npages - 1)) != 0) { 1177 order = min(flsll(diff) - 1, VM_NFREEORDER - 1); 1178 m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1); 1179 } 1180 1181 /* Free blocks of maximum size. 
*/ 1182 order = VM_NFREEORDER - 1; 1183 while (m + (1 << order) <= m_end) { 1184 KASSERT(seg == &vm_phys_segs[m->segind], 1185 ("%s: page range [%p,%p) spans multiple segments", 1186 __func__, m_end - npages, m)); 1187 vm_freelist_add(fl, m, order, 1); 1188 m += 1 << order; 1189 } 1190 /* Free blocks of diminishing size. */ 1191 while (m < m_end) { 1192 KASSERT(seg == &vm_phys_segs[m->segind], 1193 ("%s: page range [%p,%p) spans multiple segments", 1194 __func__, m_end - npages, m)); 1195 order = flsl(m_end - m) - 1; 1196 vm_freelist_add(fl, m, order, 1); 1197 m += 1 << order; 1198 } 1199 } 1200 1201 /* 1202 * Free a contiguous, arbitrarily sized set of physical pages. 1203 * 1204 * The free page queues must be locked. 1205 */ 1206 void 1207 vm_phys_free_contig(vm_page_t m, u_long npages) 1208 { 1209 int order_start, order_end; 1210 vm_page_t m_start, m_end; 1211 1212 vm_domain_free_assert_locked(vm_pagequeue_domain(m)); 1213 1214 m_start = m; 1215 order_start = max_order(m_start); 1216 if (order_start < VM_NFREEORDER - 1) 1217 m_start += 1 << order_start; 1218 m_end = m + npages; 1219 order_end = max_order(m_end); 1220 if (order_end < VM_NFREEORDER - 1) 1221 m_end -= 1 << order_end; 1222 /* 1223 * Avoid unnecessary coalescing by freeing the pages at the start and 1224 * end of the range last. 1225 */ 1226 if (m_start < m_end) 1227 vm_phys_enqueue_contig(m_start, m_end - m_start); 1228 if (order_start < VM_NFREEORDER - 1) 1229 vm_phys_free_pages(m, order_start); 1230 if (order_end < VM_NFREEORDER - 1) 1231 vm_phys_free_pages(m_end, order_end); 1232 } 1233 1234 /* 1235 * Identify the first address range within segment segind or greater 1236 * that matches the domain, lies within the low/high range, and has 1237 * enough pages. Return -1 if there is none. 
1238 */ 1239 int 1240 vm_phys_find_range(vm_page_t bounds[], int segind, int domain, 1241 u_long npages, vm_paddr_t low, vm_paddr_t high) 1242 { 1243 vm_paddr_t pa_end, pa_start; 1244 struct vm_phys_seg *end_seg, *seg; 1245 1246 KASSERT(npages > 0, ("npages is zero")); 1247 KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range")); 1248 end_seg = &vm_phys_segs[vm_phys_nsegs]; 1249 for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) { 1250 if (seg->domain != domain) 1251 continue; 1252 if (seg->start >= high) 1253 return (-1); 1254 pa_start = MAX(low, seg->start); 1255 pa_end = MIN(high, seg->end); 1256 if (pa_end - pa_start < ptoa(npages)) 1257 continue; 1258 bounds[0] = &seg->first_page[atop(pa_start - seg->start)]; 1259 bounds[1] = &seg->first_page[atop(pa_end - seg->start)]; 1260 return (seg - vm_phys_segs); 1261 } 1262 return (-1); 1263 } 1264 1265 /* 1266 * Search for the given physical page "m" in the free lists. If the search 1267 * succeeds, remove "m" from the free lists and return true. Otherwise, return 1268 * false, indicating that "m" is not in the free lists. 1269 * 1270 * The free page queues must be locked. 1271 */ 1272 bool 1273 vm_phys_unfree_page(vm_page_t m) 1274 { 1275 struct vm_freelist *fl; 1276 struct vm_phys_seg *seg; 1277 vm_paddr_t pa, pa_half; 1278 vm_page_t m_set, m_tmp; 1279 int order; 1280 1281 /* 1282 * First, find the contiguous, power of two-sized set of free 1283 * physical pages containing the given physical page "m" and 1284 * assign it to "m_set". 
1285 */ 1286 seg = &vm_phys_segs[m->segind]; 1287 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); 1288 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && 1289 order < VM_NFREEORDER - 1; ) { 1290 order++; 1291 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); 1292 if (pa >= seg->start) 1293 m_set = &seg->first_page[atop(pa - seg->start)]; 1294 else 1295 return (false); 1296 } 1297 if (m_set->order < order) 1298 return (false); 1299 if (m_set->order == VM_NFREEORDER) 1300 return (false); 1301 KASSERT(m_set->order < VM_NFREEORDER, 1302 ("vm_phys_unfree_page: page %p has unexpected order %d", 1303 m_set, m_set->order)); 1304 1305 /* 1306 * Next, remove "m_set" from the free lists. Finally, extract 1307 * "m" from "m_set" using an iterative algorithm: While "m_set" 1308 * is larger than a page, shrink "m_set" by returning the half 1309 * of "m_set" that does not contain "m" to the free lists. 1310 */ 1311 fl = (*seg->free_queues)[m_set->pool]; 1312 order = m_set->order; 1313 vm_freelist_rem(fl, m_set, order); 1314 while (order > 0) { 1315 order--; 1316 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); 1317 if (m->phys_addr < pa_half) 1318 m_tmp = &seg->first_page[atop(pa_half - seg->start)]; 1319 else { 1320 m_tmp = m_set; 1321 m_set = &seg->first_page[atop(pa_half - seg->start)]; 1322 } 1323 vm_freelist_add(fl, m_tmp, order, 0); 1324 } 1325 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); 1326 return (true); 1327 } 1328 1329 /* 1330 * Find a run of contiguous physical pages from the specified page list. 
1331 */ 1332 static vm_page_t 1333 vm_phys_find_freelist_contig(struct vm_freelist *fl, int oind, u_long npages, 1334 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1335 { 1336 struct vm_phys_seg *seg; 1337 vm_paddr_t frag, lbound, pa, page_size, pa_end, pa_pre, size; 1338 vm_page_t m, m_listed, m_ret; 1339 int order; 1340 1341 KASSERT(npages > 0, ("npages is 0")); 1342 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1343 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1344 /* Search for a run satisfying the specified conditions. */ 1345 page_size = PAGE_SIZE; 1346 size = npages << PAGE_SHIFT; 1347 frag = (npages & ~(~0UL << oind)) << PAGE_SHIFT; 1348 TAILQ_FOREACH(m_listed, &fl[oind].pl, listq) { 1349 /* 1350 * Determine if the address range starting at pa is 1351 * too low. 1352 */ 1353 pa = VM_PAGE_TO_PHYS(m_listed); 1354 if (pa < low) 1355 continue; 1356 1357 /* 1358 * If this is not the first free oind-block in this range, bail 1359 * out. We have seen the first free block already, or will see 1360 * it before failing to find an appropriate range. 1361 */ 1362 seg = &vm_phys_segs[m_listed->segind]; 1363 lbound = low > seg->start ? low : seg->start; 1364 pa_pre = pa - (page_size << oind); 1365 m = &seg->first_page[atop(pa_pre - seg->start)]; 1366 if (pa != 0 && pa_pre >= lbound && m->order == oind) 1367 continue; 1368 1369 if (!vm_addr_align_ok(pa, alignment)) 1370 /* Advance to satisfy alignment condition. */ 1371 pa = roundup2(pa, alignment); 1372 else if (frag != 0 && lbound + frag <= pa) { 1373 /* 1374 * Back up to the first aligned free block in this 1375 * range, without moving below lbound. 
1376 */ 1377 pa_end = pa; 1378 for (order = oind - 1; order >= 0; order--) { 1379 pa_pre = pa_end - (page_size << order); 1380 if (!vm_addr_align_ok(pa_pre, alignment)) 1381 break; 1382 m = &seg->first_page[atop(pa_pre - seg->start)]; 1383 if (pa_pre >= lbound && m->order == order) 1384 pa_end = pa_pre; 1385 } 1386 /* 1387 * If the extra small blocks are enough to complete the 1388 * fragment, use them. Otherwise, look to allocate the 1389 * fragment at the other end. 1390 */ 1391 if (pa_end + frag <= pa) 1392 pa = pa_end; 1393 } 1394 1395 /* Advance as necessary to satisfy boundary conditions. */ 1396 if (!vm_addr_bound_ok(pa, size, boundary)) 1397 pa = roundup2(pa + 1, boundary); 1398 pa_end = pa + size; 1399 1400 /* 1401 * Determine if the address range is valid (without overflow in 1402 * pa_end calculation), and fits within the segment. 1403 */ 1404 if (pa_end < pa || seg->end < pa_end) 1405 continue; 1406 1407 m_ret = &seg->first_page[atop(pa - seg->start)]; 1408 1409 /* 1410 * Determine whether there are enough free oind-blocks here to 1411 * satisfy the allocation request. 1412 */ 1413 pa = VM_PAGE_TO_PHYS(m_listed); 1414 do { 1415 pa += page_size << oind; 1416 if (pa >= pa_end) 1417 return (m_ret); 1418 m = &seg->first_page[atop(pa - seg->start)]; 1419 } while (oind == m->order); 1420 1421 /* 1422 * Determine if an additional series of free blocks of 1423 * diminishing size can help to satisfy the allocation request. 1424 */ 1425 while (m->order < oind && 1426 pa + 2 * (page_size << m->order) > pa_end) { 1427 pa += page_size << m->order; 1428 if (pa >= pa_end) 1429 return (m_ret); 1430 m = &seg->first_page[atop(pa - seg->start)]; 1431 } 1432 } 1433 return (NULL); 1434 } 1435 1436 /* 1437 * Find a run of contiguous physical pages from the specified free list 1438 * table. 
1439 */ 1440 static vm_page_t 1441 vm_phys_find_queues_contig( 1442 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX], 1443 u_long npages, vm_paddr_t low, vm_paddr_t high, 1444 u_long alignment, vm_paddr_t boundary) 1445 { 1446 struct vm_freelist *fl; 1447 vm_page_t m_ret; 1448 vm_paddr_t pa, pa_end, size; 1449 int oind, order, pind; 1450 1451 KASSERT(npages > 0, ("npages is 0")); 1452 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1453 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1454 /* Compute the queue that is the best fit for npages. */ 1455 order = flsl(npages - 1); 1456 /* Search for a large enough free block. */ 1457 size = npages << PAGE_SHIFT; 1458 for (oind = order; oind < VM_NFREEORDER; oind++) { 1459 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1460 fl = (*queues)[pind]; 1461 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { 1462 /* 1463 * Determine if the address range starting at pa 1464 * is within the given range, satisfies the 1465 * given alignment, and does not cross the given 1466 * boundary. 1467 */ 1468 pa = VM_PAGE_TO_PHYS(m_ret); 1469 pa_end = pa + size; 1470 if (low <= pa && pa_end <= high && 1471 vm_addr_ok(pa, size, alignment, boundary)) 1472 return (m_ret); 1473 } 1474 } 1475 } 1476 if (order < VM_NFREEORDER) 1477 return (NULL); 1478 /* Search for a long-enough sequence of small blocks. */ 1479 oind = VM_NFREEORDER - 1; 1480 for (pind = 0; pind < VM_NFREEPOOL; pind++) { 1481 fl = (*queues)[pind]; 1482 m_ret = vm_phys_find_freelist_contig(fl, oind, npages, 1483 low, high, alignment, boundary); 1484 if (m_ret != NULL) 1485 return (m_ret); 1486 } 1487 return (NULL); 1488 } 1489 1490 /* 1491 * Allocate a contiguous set of physical pages of the given size 1492 * "npages" from the free lists. All of the physical pages must be at 1493 * or above the given physical address "low" and below the given 1494 * physical address "high". 
The given value "alignment" determines the 1495 * alignment of the first physical page in the set. If the given value 1496 * "boundary" is non-zero, then the set of physical pages cannot cross 1497 * any physical address boundary that is a multiple of that value. Both 1498 * "alignment" and "boundary" must be a power of two. 1499 */ 1500 vm_page_t 1501 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, 1502 u_long alignment, vm_paddr_t boundary) 1503 { 1504 vm_paddr_t pa_end, pa_start; 1505 struct vm_freelist *fl; 1506 vm_page_t m, m_run; 1507 struct vm_phys_seg *seg; 1508 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX]; 1509 int oind, segind; 1510 1511 KASSERT(npages > 0, ("npages is 0")); 1512 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 1513 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 1514 vm_domain_free_assert_locked(VM_DOMAIN(domain)); 1515 if (low >= high) 1516 return (NULL); 1517 queues = NULL; 1518 m_run = NULL; 1519 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { 1520 seg = &vm_phys_segs[segind]; 1521 if (seg->start >= high || seg->domain != domain) 1522 continue; 1523 if (low >= seg->end) 1524 break; 1525 if (low <= seg->start) 1526 pa_start = seg->start; 1527 else 1528 pa_start = low; 1529 if (high < seg->end) 1530 pa_end = high; 1531 else 1532 pa_end = seg->end; 1533 if (pa_end - pa_start < ptoa(npages)) 1534 continue; 1535 /* 1536 * If a previous segment led to a search using 1537 * the same free lists as would this segment, then 1538 * we've actually already searched within this 1539 * too. So skip it. 1540 */ 1541 if (seg->free_queues == queues) 1542 continue; 1543 queues = seg->free_queues; 1544 m_run = vm_phys_find_queues_contig(queues, npages, 1545 low, high, alignment, boundary); 1546 if (m_run != NULL) 1547 break; 1548 } 1549 if (m_run == NULL) 1550 return (NULL); 1551 1552 /* Allocate pages from the page-range found. 
*/ 1553 for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) { 1554 fl = (*queues)[m->pool]; 1555 oind = m->order; 1556 vm_freelist_rem(fl, m, oind); 1557 if (m->pool != VM_FREEPOOL_DEFAULT) 1558 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind); 1559 } 1560 /* Return excess pages to the free lists. */ 1561 fl = (*queues)[VM_FREEPOOL_DEFAULT]; 1562 vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0); 1563 return (m_run); 1564 } 1565 1566 /* 1567 * Return the index of the first unused slot which may be the terminating 1568 * entry. 1569 */ 1570 static int 1571 vm_phys_avail_count(void) 1572 { 1573 int i; 1574 1575 for (i = 0; phys_avail[i + 1]; i += 2) 1576 continue; 1577 if (i > PHYS_AVAIL_ENTRIES) 1578 panic("Improperly terminated phys_avail %d entries", i); 1579 1580 return (i); 1581 } 1582 1583 /* 1584 * Assert that a phys_avail entry is valid. 1585 */ 1586 static void 1587 vm_phys_avail_check(int i) 1588 { 1589 if (phys_avail[i] & PAGE_MASK) 1590 panic("Unaligned phys_avail[%d]: %#jx", i, 1591 (intmax_t)phys_avail[i]); 1592 if (phys_avail[i+1] & PAGE_MASK) 1593 panic("Unaligned phys_avail[%d + 1]: %#jx", i, 1594 (intmax_t)phys_avail[i]); 1595 if (phys_avail[i + 1] < phys_avail[i]) 1596 panic("phys_avail[%d] start %#jx < end %#jx", i, 1597 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]); 1598 } 1599 1600 /* 1601 * Return the index of an overlapping phys_avail entry or -1. 1602 */ 1603 #ifdef NUMA 1604 static int 1605 vm_phys_avail_find(vm_paddr_t pa) 1606 { 1607 int i; 1608 1609 for (i = 0; phys_avail[i + 1]; i += 2) 1610 if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) 1611 return (i); 1612 return (-1); 1613 } 1614 #endif 1615 1616 /* 1617 * Return the index of the largest entry. 
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}

	return (largest);
}

/*
 * Return the size in bytes of the phys_avail entry at index i.
 */
vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split an entry at the address 'pa'.  Return zero on success or errno.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	/* 'pa' must fall strictly inside entry i. */
	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
		panic("vm_phys_avail_split: invalid address");
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
	/* Shift entries i and beyond down two slots to open a gap. */
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	/* Entry i now ends at pa and the new entry i+2 begins there. */
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i+2);

	return (0);
}

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	/* Pages managed by the VM system honor the PG_NODUMP flag. */
	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	/* Otherwise fall back to the dump_avail ranges. */
	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

/*
 * Record a physical segment [start, end) before vm_phys initialization;
 * vm_phys_early_startup() later registers these with vm_phys_add_seg().
 */
void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	/* vm_phys_early_nsegs is set to -1 once startup has consumed these. */
	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	/* -1 wraps to the maximum vm_paddr_t: no upper bound by default. */
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find biggest physical segment in within the desired
	 * numa domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		/*
		 * NOTE(review): phys_avail[i+1] - alloc_size can wrap if the
		 * entry is smaller than alloc_size; relies on the wrapped
		 * value exceeding mem_start — confirm on all platforms.
		 */
		if (phys_avail[i+1] - alloc_size < mem_start ||
		    phys_avail[i+1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	/* NOTE(review): the mask assumes alloc_size is a power of two. */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;

	/* Carve the allocation off the (aligned) end of the entry. */
	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

/*
 * Page-align phys_avail, register early segments, and (with NUMA) split
 * phys_avail entries on domain boundaries before the allocator starts.
 */
void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	/* Shrink each entry inward to whole-page boundaries. */
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	/* Mark early-segment registration closed; see vm_phys_early_add_seg. */
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif