/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>
#include <sys/seq.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>

#include <vm/vm_domain.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");

#ifdef VM_NUMA_ALLOC
struct mem_affinity *mem_affinity;
int *mem_locality;
#endif

int vm_ndomains = 1;

struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
int vm_phys_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(_vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];

static int vm_nfreelists;

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_ISADMA
#define	VM_ISADMA_BOUNDARY	16777216
#endif
#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
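 *
 * For example, on a platform that defines all three optional boundaries,
 * these assertions require
 *
 *	VM_ISADMA_BOUNDARY (16MB) < VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY (4GB)
 *
 * so that vm_phys_add_seg() can split segments by testing the boundaries in
 * ascending order.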
 */
#if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY)
CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY);
#endif
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

#ifdef VM_NUMA_ALLOC
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
    vm_paddr_t boundary);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order);

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

boolean_t
vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
{
	struct vm_phys_seg *s;
	int idx;

	while ((idx = ffsl(mask)) != 0) {
		idx--;	/* ffsl counts from 1 */
		mask &= ~(1UL << idx);
		s = &vm_phys_segs[idx];
		if (low < s->end && high > s->start)
			return (TRUE);
	}
	return (FALSE);
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
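 *
 * Exported as the string sysctl "vm.phys_free" (see the SYSCTL_OID above);
 * it can be inspected from userland with, e.g., "sysctl vm.phys_free".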
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f, int t)
{

#ifdef VM_NUMA_ALLOC
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef VM_NUMA_ALLOC
/*
 * Outputs the VM locality table.
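 *
 * Row i, column j of the table is the value of vm_phys_mem_affinity(i, j),
 * i.e. the cost of domain i accessing memory in domain j, or -1 when no
 * affinity information is available.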
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef VM_NUMA_ALLOC
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
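	 *
	 * For example, with VM_FREELIST_DMA32 configured, a segment
	 * [3GB, 5GB) is split at VM_DMA32_BOUNDARY into [3GB, 4GB) and
	 * [4GB, 5GB), so that each piece lies entirely within one free list.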
	 */
	paddr = start;
#ifdef VM_FREELIST_ISADMA
	if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY);
		paddr = VM_ISADMA_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	u_long npages;
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
	npages = 0;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_ISADMA
		if (seg->end <= VM_ISADMA_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1;
		else
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
			npages += atop(seg->end - seg->start);
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
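	 *
	 * In the VM_PHYSSEG_SPARSE case, each segment's pages are carved
	 * consecutively out of vm_page_array; in the dense case, first_page
	 * is computed directly from the segment's starting physical address.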
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_ISADMA
		if (seg->end <= VM_ISADMA_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_ISADMA];
			KASSERT(flind >= 0,
			    ("vm_phys_init: ISADMA flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, 0);
	}
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int flind;

	for (flind = 0; flind < vm_nfreelists; flind++) {
		m = vm_phys_alloc_freelist_pages(domain, flind, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
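 *
 * A minimal usage sketch (illustrative only): allocating a single page in
 * domain 0 from the default pool of the first free list.
 *
 *	mtx_lock(&vm_page_queue_free_mtx);
 *	m = vm_phys_alloc_freelist_pages(0, 0, VM_FREEPOOL_DEFAULT, 0);
 *	mtx_unlock(&vm_page_queue_free_mtx);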
 */
vm_page_t
vm_phys_alloc_freelist_pages(int domain, int flind, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(flind < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    flind));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			vm_phys_split_pages(m, oind, fl, order);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_set_pool(pool, m, oind);
				vm_phys_split_pages(m, oind, fl, order);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (&seg->first_page[atop(pa - seg->start)]);
	}
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
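		/*
		 * The range at least starts within vm_page_array; decide
		 * whether it also ends inside of it.
		 */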
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page),
		    dpage_count, memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	/* Check for a NULL result to avoid dereferencing it in the panic. */
	if (seg == NULL || seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	seg = &vm_phys_segs[m->segind];
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	u_int n;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	for (;; npages -= n) {
		/*
		 * Unsigned "min" is used here so that "order" is assigned
		 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
		 * or the low-order bits of its physical address are zero
		 * because the size of a physical address exceeds the size of
		 * a long.
		 */
		order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
		    VM_NFREEORDER - 1);
		n = 1 << order;
		if (npages < n)
			break;
		vm_phys_free_pages(m, order);
		m += n;
	}
	/* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
	for (; npages > 0; npages -= n) {
		order = flsl(npages) - 1;
		n = 1 << order;
		vm_phys_free_pages(m, order);
		m += n;
	}
}

/*
 * Scan physical memory between the specified addresses "low" and "high" for a
 * run of contiguous physical pages that satisfy the specified conditions, and
 * return the lowest page in the run.  The specified "alignment" determines
 * the alignment of the lowest physical page in the run.
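 * (For example, on x86 an alignment of NBPDR yields a run whose first page
 * is aligned suitably for a superpage mapping.)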
 * If the specified
 * "boundary" is non-zero, then the run of physical pages cannot span a
 * physical address that is a multiple of "boundary".
 *
 * "npages" must be greater than zero.  Both "alignment" and "boundary" must
 * be a power of two.
 */
vm_page_t
vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary, int options)
{
	vm_paddr_t pa_end;
	vm_page_t m_end, m_run, m_start;
	struct vm_phys_seg *seg;
	int segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	if (low >= high)
		return (NULL);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high)
			break;
		if (low >= seg->end)
			continue;
		if (low <= seg->start)
			m_start = seg->first_page;
		else
			m_start = &seg->first_page[atop(low - seg->start)];
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
			continue;
		m_end = &seg->first_page[atop(pa_end - seg->start)];
		m_run = vm_page_scan_contig(npages, m_start, m_end,
		    alignment, boundary, options);
		if (m_run != NULL)
			return (m_run);
	}
	return (NULL);
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (FALSE);
	}
	if (m_set->order < order)
		return (FALSE);
	if (m_set->order == VM_NFREEORDER)
		return (FALSE);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
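	 *
	 * For example, if "m" is the third page of a free order-2 block,
	 * the loop below returns the first (order-1) half and then the
	 * fourth (order-0) page to the free lists, leaving "m_set" == "m".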
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ ((vm_paddr_t)1 <<
		    (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (TRUE);
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	vm_page_t m_run;
	struct vm_phys_seg *seg;
	int segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	if (low >= high)
		return (NULL);
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
		    alignment, boundary);
		if (m_run != NULL)
			break;
	}
	return (m_run);
}

/*
 * Allocate a run of contiguous physical pages from the free list for the
 * specified segment.
 */
static vm_page_t
vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_paddr_t pa, pa_end, size;
	vm_page_t m, m_ret;
	u_long npages_end;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	/* Compute the queue that is the best fit for npages. */
	for (order = 0; (1 << order) < npages; order++);
	/* Search for a run satisfying the specified conditions. */
	size = npages << PAGE_SHIFT;
	for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
	    oind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = (*seg->free_queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
				/*
				 * Is the size of this allocation request
				 * larger than the largest block size?
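				 * If so, no single free block can satisfy
				 * the request, so verify that enough
				 * subsequent maximum-order blocks are free.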
				 */
				if (order >= VM_NFREEORDER) {
					/*
					 * Determine if a sufficient number of
					 * subsequent blocks to satisfy the
					 * allocation request are free.
					 */
					pa = VM_PAGE_TO_PHYS(m_ret);
					pa_end = pa + size;
					for (;;) {
						pa += 1 << (PAGE_SHIFT +
						    VM_NFREEORDER - 1);
						if (pa >= pa_end ||
						    pa < seg->start ||
						    pa >= seg->end)
							break;
						m = &seg->first_page[atop(pa -
						    seg->start)];
						if (m->order != VM_NFREEORDER -
						    1)
							break;
					}
					/* If not, go to the next block. */
					if (pa < pa_end)
						continue;
				}

				/*
				 * Determine if the blocks are within the
				 * given range, satisfy the given alignment,
				 * and do not cross the given boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (pa >= low && pa_end <= high &&
				    (pa & (alignment - 1)) == 0 &&
				    rounddown2(pa ^ (pa_end - 1), boundary) ==
				    0)
					goto done;
			}
		}
	}
	return (NULL);
done:
	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
		fl = (*seg->free_queues)[m->pool];
		vm_freelist_rem(fl, m, m->order);
	}
	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
	fl = (*seg->free_queues)[m_ret->pool];
	vm_phys_split_pages(m_ret, oind, fl, order);
	/* Return excess pages to the free lists. */
	npages_end = roundup2(npages, 1 << imin(oind, order));
	if (npages < npages_end)
		vm_phys_free_contig(&m_ret[npages], npages_end - npages);
	return (m_ret);
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif