/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Superpage reservation management module
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/ktr.h>
#include <sys/vmmeter.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>

/*
 * The reservation system supports the speculative allocation of large physical
 * pages ("superpages").  Speculative allocation enables the fully automatic
 * utilization of superpages by the virtual memory system.  In other words, no
 * programmatic directives are required to use superpages.
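 *
 * For example, on amd64 VM_LEVEL_0_ORDER is 9, so a level 0 reservation
 * covers 512 contiguous 4KB small pages, i.e., one 2MB superpage.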
 */

#if VM_NRESERVLEVEL > 0

#ifndef VM_LEVEL_0_ORDER_MAX
#define	VM_LEVEL_0_ORDER_MAX	VM_LEVEL_0_ORDER
#endif

/*
 * The number of small pages that are contained in a level 0 reservation
 */
#define	VM_LEVEL_0_NPAGES	(1 << VM_LEVEL_0_ORDER)
#define	VM_LEVEL_0_NPAGES_MAX	(1 << VM_LEVEL_0_ORDER_MAX)

/*
 * The number of bits by which a physical address is shifted to obtain the
 * reservation number
 */
#define	VM_LEVEL_0_SHIFT	(VM_LEVEL_0_ORDER + PAGE_SHIFT)

/*
 * The size of a level 0 reservation in bytes
 */
#define	VM_LEVEL_0_SIZE		(1 << VM_LEVEL_0_SHIFT)

/*
 * Computes the index of the small page underlying the given (object, pindex)
 * within the reservation's array of small pages.
 */
#define	VM_RESERV_INDEX(object, pindex)	\
    (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))

/*
 * The size of a population map entry
 */
typedef	u_long		popmap_t;

/*
 * The number of bits in a population map entry
 */
#define	NBPOPMAP	(NBBY * sizeof(popmap_t))

/*
 * The number of population map entries in a reservation
 */
#define	NPOPMAP		howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
#define	NPOPMAP_MAX	howmany(VM_LEVEL_0_NPAGES_MAX, NBPOPMAP)

/*
 * Number of elapsed ticks before we update the LRU queue position.  Used
 * to reduce contention and churn on the list.
 */
#define	PARTPOPSLOP	1

/*
 * Clear a bit in the population map.
 */
static __inline void
popmap_clear(popmap_t popmap[], int i)
{

	popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
}

/*
 * Set a bit in the population map.
 */
static __inline void
popmap_set(popmap_t popmap[], int i)
{

	popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
}

/*
 * Is a bit in the population map clear?
 */
static __inline boolean_t
popmap_is_clear(popmap_t popmap[], int i)
{

	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
}

/*
 * Is a bit in the population map set?
 */
static __inline boolean_t
popmap_is_set(popmap_t popmap[], int i)
{

	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
}

/*
 * The reservation structure
 *
 * A reservation structure is constructed whenever a large physical page is
 * speculatively allocated to an object.  The reservation provides the small
 * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
 * within that object.  The reservation's "popcnt" tracks the number of these
 * small physical pages that are in use at any given time.  When and if the
 * reservation is not fully utilized, it appears in the queue of partially
 * populated reservations.  The reservation always appears on the containing
 * object's list of reservations.
 *
 * A partially populated reservation can be broken and reclaimed at any time.
 *
 * c - constant after boot
 * d - vm_reserv_domain_lock
 * o - vm_reserv_object_lock
 * r - vm_reserv_lock
 * s - vm_reserv_domain_scan_lock
 */
struct vm_reserv {
	struct mtx	lock;			/* reservation lock. */
	TAILQ_ENTRY(vm_reserv) partpopq;	/* (d, r) per-domain queue. */
	LIST_ENTRY(vm_reserv) objq;		/* (o, r) object queue */
	vm_object_t	object;			/* (o, r) containing object */
	vm_pindex_t	pindex;			/* (o, r) offset in object */
	vm_page_t	pages;			/* (c) first page */
	uint16_t	popcnt;			/* (r) # of pages in use */
	uint8_t		domain;			/* (c) NUMA domain. */
	char		inpartpopq;		/* (d, r) */
	int		lasttick;		/* (r) last pop update tick. */
	popmap_t	popmap[NPOPMAP_MAX];	/* (r) bit vector, used pages */
};

TAILQ_HEAD(vm_reserv_queue, vm_reserv);

#define	vm_reserv_lockptr(rv)		(&(rv)->lock)
#define	vm_reserv_assert_locked(rv)					\
	mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
#define	vm_reserv_lock(rv)		mtx_lock(vm_reserv_lockptr(rv))
#define	vm_reserv_trylock(rv)		mtx_trylock(vm_reserv_lockptr(rv))
#define	vm_reserv_unlock(rv)		mtx_unlock(vm_reserv_lockptr(rv))

/*
 * The reservation array
 *
 * This array is analogous in function to vm_page_array.  It differs in the
 * respect that it may contain a greater number of useful reservation
 * structures than there are (physical) superpages.  These "invalid"
 * reservation structures exist to trade off space for time in the
 * implementation of vm_reserv_from_page().  Invalid reservation structures are
 * distinguishable from "valid" reservation structures by inspecting the
 * reservation's "pages" field.  Invalid reservation structures have a NULL
 * "pages" field.
 *
 * vm_reserv_from_page() maps a small (physical) page to an element of this
 * array by computing a physical reservation number from the page's physical
 * address.  The physical reservation number is used as the array index.
 *
 * An "active" reservation is a valid reservation structure that has a non-NULL
 * "object" field and a non-zero "popcnt" field.  In other words, every active
 * reservation belongs to a particular object.  Moreover, every active
 * reservation has an entry in the containing object's list of reservations.
 */
static vm_reserv_t vm_reserv_array;

/*
 * The per-domain partially populated reservation queues
 *
 * These queues enable the fast recovery of an unused free small page from a
 * partially populated reservation.  The reservation at the head of a queue
 * is the least recently changed, partially populated reservation.
 *
 * Access to this queue is synchronized by the per-domain reservation lock.
 * Threads reclaiming free pages from the queue must hold the per-domain scan
 * lock.
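 *
 * The embedded marker reservation is threaded through a queue by
 * vm_reserv_reclaim_contig() so that the domain lock can be dropped while a
 * candidate reservation is examined; the marker's lock also serves as the
 * per-domain scan lock.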
 */
struct vm_reserv_domain {
	struct mtx		lock;
	struct vm_reserv_queue	partpop;	/* (d) */
	struct vm_reserv	marker;		/* (d, s) scan marker/lock */
} __aligned(CACHE_LINE_SIZE);

static struct vm_reserv_domain vm_rvd[MAXMEMDOM];

#define	vm_reserv_domain_lockptr(d)	(&vm_rvd[(d)].lock)
#define	vm_reserv_domain_assert_locked(d)	\
	mtx_assert(vm_reserv_domain_lockptr(d), MA_OWNED)
#define	vm_reserv_domain_lock(d)	mtx_lock(vm_reserv_domain_lockptr(d))
#define	vm_reserv_domain_unlock(d)	mtx_unlock(vm_reserv_domain_lockptr(d))

#define	vm_reserv_domain_scan_lock(d)	mtx_lock(&vm_rvd[(d)].marker.lock)
#define	vm_reserv_domain_scan_unlock(d)	mtx_unlock(&vm_rvd[(d)].marker.lock)

static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Reservation Info");

static COUNTER_U64_DEFINE_EARLY(vm_reserv_broken);
SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
    &vm_reserv_broken, "Cumulative number of broken reservations");

static COUNTER_U64_DEFINE_EARLY(vm_reserv_freed);
SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
    &vm_reserv_freed, "Cumulative number of freed reservations");

static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD,
    NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");

static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);

SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0,
    sysctl_vm_reserv_partpopq, "A",
    "Partially populated reservation queues");

static COUNTER_U64_DEFINE_EARLY(vm_reserv_reclaimed);
SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
    &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations");

/*
 * The object lock pool is used to synchronize the rvq.  We cannot use a
 * pool mutex because it is required before malloc works.
 *
 * The "hash" function could be made faster without divide and modulo.
 */
#define	VM_RESERV_OBJ_LOCK_COUNT	MAXCPU

struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT];

#define	vm_reserv_object_lock_idx(object)			\
	    (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT)
#define	vm_reserv_object_lock_ptr(object)			\
	    &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))]
#define	vm_reserv_object_lock(object)				\
	    mtx_lock(vm_reserv_object_lock_ptr((object)))
#define	vm_reserv_object_unlock(object)				\
	    mtx_unlock(vm_reserv_object_lock_ptr((object)))

static void		vm_reserv_break(vm_reserv_t rv);
static void		vm_reserv_depopulate(vm_reserv_t rv, int index);
static vm_reserv_t	vm_reserv_from_page(vm_page_t m);
static boolean_t	vm_reserv_has_pindex(vm_reserv_t rv,
			    vm_pindex_t pindex);
static void		vm_reserv_populate(vm_reserv_t rv, int index);
static void		vm_reserv_reclaim(vm_reserv_t rv);

/*
 * Returns the current number of full reservations.
 *
 * Since the number of full reservations is computed without acquiring any
 * locks, the returned value is inexact.
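 *
 * Every reservation-aligned, reservation-sized block of physical memory in
 * each physical segment is visited, and those whose popcnt equals
 * VM_LEVEL_0_NPAGES are counted.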
 */
static int
sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
{
	vm_paddr_t paddr;
	struct vm_phys_seg *seg;
	vm_reserv_t rv;
	int fullpop, segind;

	fullpop = 0;
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
		while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
		    VM_LEVEL_0_SIZE <= seg->end) {
			rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
			fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
			paddr += VM_LEVEL_0_SIZE;
		}
	}
	return (sysctl_handle_int(oidp, &fullpop, 0, req));
}

/*
 * Describes the current state of the partially populated reservation queue.
 */
static int
sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	vm_reserv_t rv;
	int counter, error, domain, level, unused_pages;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	sbuf_printf(&sbuf, "\nDOMAIN    LEVEL     SIZE  NUMBER\n\n");
	for (domain = 0; domain < vm_ndomains; domain++) {
		for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
			counter = 0;
			unused_pages = 0;
			vm_reserv_domain_lock(domain);
			TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
				if (rv == &vm_rvd[domain].marker)
					continue;
				counter++;
				unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
			}
			vm_reserv_domain_unlock(domain);
			sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
			    domain, level,
			    unused_pages * ((int)PAGE_SIZE / 1024), counter);
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Remove a reservation from the object's objq.
 */
static void
vm_reserv_remove(vm_reserv_t rv)
{
	vm_object_t object;

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_remove: reserv %p is free", rv));
	KASSERT(!rv->inpartpopq,
	    ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv));
	object = rv->object;
	vm_reserv_object_lock(object);
	LIST_REMOVE(rv, objq);
	rv->object = NULL;
	vm_reserv_object_unlock(object);
}

/*
 * Insert a new reservation into the object's objq.
 */
static void
vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
{
	int i;

	vm_reserv_assert_locked(rv);
	CTR6(KTR_VM,
	    "%s: rv %p(%p) object %p new %p popcnt %d",
	    __FUNCTION__, rv, rv->pages, rv->object, object,
	    rv->popcnt);
	KASSERT(rv->object == NULL,
	    ("vm_reserv_insert: reserv %p isn't free", rv));
	KASSERT(rv->popcnt == 0,
	    ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv));
	KASSERT(!rv->inpartpopq,
	    ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv));
	for (i = 0; i < NPOPMAP; i++)
		KASSERT(rv->popmap[i] == 0,
		    ("vm_reserv_insert: reserv %p's popmap is corrupted", rv));
	vm_reserv_object_lock(object);
	rv->pindex = pindex;
	rv->object = object;
	rv->lasttick = ticks;
	LIST_INSERT_HEAD(&object->rvq, rv, objq);
	vm_reserv_object_unlock(object);
}

/*
 * Reduces the given reservation's population count.  If the population count
 * becomes zero, the reservation is destroyed.  Additionally, moves the
 * reservation to the tail of the partially populated reservation queue if the
 * population count is non-zero.
 */
static void
vm_reserv_depopulate(vm_reserv_t rv, int index)
{
	struct vm_domain *vmd;

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_depopulate: reserv %p is free", rv));
	KASSERT(popmap_is_set(rv->popmap, index),
	    ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
	    index));
	KASSERT(rv->popcnt > 0,
	    ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
	KASSERT(rv->domain < vm_ndomains,
	    ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
	    rv, rv->domain));
	if (rv->popcnt == VM_LEVEL_0_NPAGES) {
		KASSERT(rv->pages->psind == 1,
		    ("vm_reserv_depopulate: reserv %p is already demoted",
		    rv));
		rv->pages->psind = 0;
	}
	popmap_clear(rv->popmap, index);
	rv->popcnt--;
	if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP ||
	    rv->popcnt == 0) {
		vm_reserv_domain_lock(rv->domain);
		if (rv->inpartpopq) {
			TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
			rv->inpartpopq = FALSE;
		}
		if (rv->popcnt != 0) {
			rv->inpartpopq = TRUE;
			TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv,
			    partpopq);
		}
		vm_reserv_domain_unlock(rv->domain);
		rv->lasttick = ticks;
	}
	vmd = VM_DOMAIN(rv->domain);
	if (rv->popcnt == 0) {
		vm_reserv_remove(rv);
		vm_domain_free_lock(vmd);
		vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
		vm_domain_free_unlock(vmd);
		counter_u64_add(vm_reserv_freed, 1);
	}
	vm_domain_freecnt_inc(vmd, 1);
}

/*
 * Returns the reservation to which the given page might belong.
 */
static __inline vm_reserv_t
vm_reserv_from_page(vm_page_t m)
{

	return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
}

/*
 * Returns an existing reservation or NULL and initializes the successor
 * pointer.
 */
static vm_reserv_t
vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex,
    vm_page_t mpred, vm_page_t *msuccp)
{
	vm_reserv_t rv;
	vm_page_t msucc;

	msucc = NULL;
	if (mpred != NULL) {
		KASSERT(mpred->object == object,
		    ("vm_reserv_from_object: object doesn't contain mpred"));
		KASSERT(mpred->pindex < pindex,
		    ("vm_reserv_from_object: mpred doesn't precede pindex"));
		rv = vm_reserv_from_page(mpred);
		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
			goto found;
		msucc = TAILQ_NEXT(mpred, listq);
	} else
		msucc = TAILQ_FIRST(&object->memq);
	if (msucc != NULL) {
		KASSERT(msucc->pindex > pindex,
		    ("vm_reserv_from_object: msucc doesn't succeed pindex"));
		rv = vm_reserv_from_page(msucc);
		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
			goto found;
	}
	rv = NULL;

found:
	*msuccp = msucc;

	return (rv);
}

/*
 * Returns TRUE if the given reservation contains the given page index and
 * FALSE otherwise.
 */
static __inline boolean_t
vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
{

	return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
}

/*
 * Increases the given reservation's population count.  Moves the reservation
 * to the tail of the partially populated reservation queue.
 */
static void
vm_reserv_populate(vm_reserv_t rv, int index)
{

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_populate: reserv %p is free", rv));
	KASSERT(popmap_is_clear(rv->popmap, index),
	    ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
	    index));
	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
	    ("vm_reserv_populate: reserv %p is already full", rv));
	KASSERT(rv->pages->psind == 0,
	    ("vm_reserv_populate: reserv %p is already promoted", rv));
	KASSERT(rv->domain < vm_ndomains,
	    ("vm_reserv_populate: reserv %p's domain is corrupted %d",
	    rv, rv->domain));
	popmap_set(rv->popmap, index);
	rv->popcnt++;
	if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP &&
	    rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES)
		return;
	rv->lasttick = ticks;
	vm_reserv_domain_lock(rv->domain);
	if (rv->inpartpopq) {
		TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
		rv->inpartpopq = FALSE;
	}
	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
		rv->inpartpopq = TRUE;
		TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq);
	} else {
		KASSERT(rv->pages->psind == 0,
		    ("vm_reserv_populate: reserv %p is already promoted",
		    rv));
		rv->pages->psind = 1;
	}
	vm_reserv_domain_unlock(rv->domain);
}

/*
 * Allocates a contiguous set of physical pages of the given size "npages"
 * from existing or newly created reservations.  All of the physical pages
 * must be at or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross any
 * physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 *
 * The page "mpred" must immediately precede the offset "pindex" within the
 * specified object.
 *
 * The object must be locked.
 */
vm_page_t
vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain,
    int req, vm_page_t mpred, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_domain *vmd;
	vm_paddr_t pa, size;
	vm_page_t m, m_ret, msucc;
	vm_pindex_t first, leftcap, rightcap;
	vm_reserv_t rv;
	u_long allocpages, maxpages, minpages;
	int i, index, n;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));

	/*
	 * Is a reservation fundamentally impossible?
	 */
	if (pindex < VM_RESERV_INDEX(object, pindex) ||
	    pindex + npages > object->size)
		return (NULL);

	/*
	 * All reservations of a particular size have the same alignment.
	 * Assuming that the first page is allocated from a reservation, the
	 * least significant bits of its physical address can be determined
	 * from its offset from the beginning of the reservation and the size
	 * of the reservation.
	 *
	 * Could the specified index within a reservation of the smallest
	 * possible size satisfy the alignment and boundary requirements?
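	 *
	 * For example, with 4KB pages a 64KB alignment requirement can only
	 * be satisfied by an index that is a multiple of 16 within the
	 * reservation.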
	 */
	pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
	if ((pa & (alignment - 1)) != 0)
		return (NULL);
	size = npages << PAGE_SHIFT;
	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (NULL);

	/*
	 * Look for an existing reservation.
	 */
	rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
	if (rv != NULL) {
		KASSERT(object != kernel_object || rv->domain == domain,
		    ("vm_reserv_alloc_contig: domain mismatch"));
		index = VM_RESERV_INDEX(object, pindex);
		/* Does the allocation fit within the reservation? */
		if (index + npages > VM_LEVEL_0_NPAGES)
			return (NULL);
		domain = rv->domain;
		vmd = VM_DOMAIN(domain);
		vm_reserv_lock(rv);
		/* Handle reclaim race. */
		if (rv->object != object)
			goto out;
		m = &rv->pages[index];
		pa = VM_PAGE_TO_PHYS(m);
		if (pa < low || pa + size > high ||
		    (pa & (alignment - 1)) != 0 ||
		    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
			goto out;
		/* Handle vm_page_rename(m, new_object, ...). */
		for (i = 0; i < npages; i++)
			if (popmap_is_set(rv->popmap, index + i))
				goto out;
		if (!vm_domain_allocate(vmd, req, npages))
			goto out;
		for (i = 0; i < npages; i++)
			vm_reserv_populate(rv, index + i);
		vm_reserv_unlock(rv);
		return (m);
out:
		vm_reserv_unlock(rv);
		return (NULL);
	}

	/*
	 * Could at least one reservation fit between the first index to the
	 * left that can be used ("leftcap") and the first index to the right
	 * that cannot be used ("rightcap")?
	 *
	 * We must synchronize with the reserv object lock to protect the
	 * pindex/object of the resulting reservations against rename while
	 * we are inspecting.
	 */
	first = pindex - VM_RESERV_INDEX(object, pindex);
	minpages = VM_RESERV_INDEX(object, pindex) + npages;
	maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
	allocpages = maxpages;
	vm_reserv_object_lock(object);
	if (mpred != NULL) {
		if ((rv = vm_reserv_from_page(mpred))->object != object)
			leftcap = mpred->pindex + 1;
		else
			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
		if (leftcap > first) {
			vm_reserv_object_unlock(object);
			return (NULL);
		}
	}
	if (msucc != NULL) {
		if ((rv = vm_reserv_from_page(msucc))->object != object)
			rightcap = msucc->pindex;
		else
			rightcap = rv->pindex;
		if (first + maxpages > rightcap) {
			if (maxpages == VM_LEVEL_0_NPAGES) {
				vm_reserv_object_unlock(object);
				return (NULL);
			}

			/*
			 * At least one reservation will fit between "leftcap"
			 * and "rightcap".  However, a reservation for the
			 * last of the requested pages will not fit.  Reduce
			 * the size of the upcoming allocation accordingly.
			 */
			allocpages = minpages;
		}
	}
	vm_reserv_object_unlock(object);

	/*
	 * Would the last new reservation extend past the end of the object?
	 *
	 * If the object is unlikely to grow don't allocate a reservation for
	 * the tail.
	 */
	if ((object->flags & OBJ_ANON) == 0 &&
	    first + maxpages > object->size) {
		if (maxpages == VM_LEVEL_0_NPAGES)
			return (NULL);
		allocpages = minpages;
	}

	/*
	 * Allocate the physical pages.  The alignment and boundary specified
	 * for this allocation may be different from the alignment and
	 * boundary specified for the requested pages.  For instance, the
	 * specified index may not be the first page within the first new
	 * reservation.
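	 *
	 * Aligning the physical allocation to VM_LEVEL_0_SIZE guarantees that
	 * every reservation initialized below begins on a reservation
	 * boundary.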
	 */
	m = NULL;
	vmd = VM_DOMAIN(domain);
	if (vm_domain_allocate(vmd, req, npages)) {
		vm_domain_free_lock(vmd);
		m = vm_phys_alloc_contig(domain, allocpages, low, high,
		    ulmax(alignment, VM_LEVEL_0_SIZE),
		    boundary > VM_LEVEL_0_SIZE ? boundary : 0);
		vm_domain_free_unlock(vmd);
		if (m == NULL) {
			vm_domain_freecnt_inc(vmd, npages);
			return (NULL);
		}
	} else
		return (NULL);
	KASSERT(vm_phys_domain(m) == domain,
	    ("vm_reserv_alloc_contig: Page domain does not match requested."));

	/*
	 * The allocated physical pages always begin at a reservation
	 * boundary, but they do not always end at a reservation boundary.
	 * Initialize every reservation that is completely covered by the
	 * allocated physical pages.
	 */
	m_ret = NULL;
	index = VM_RESERV_INDEX(object, pindex);
	do {
		rv = vm_reserv_from_page(m);
		KASSERT(rv->pages == m,
		    ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
		    rv));
		vm_reserv_lock(rv);
		vm_reserv_insert(rv, object, first);
		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
		for (i = 0; i < n; i++)
			vm_reserv_populate(rv, index + i);
		npages -= n;
		if (m_ret == NULL) {
			m_ret = &rv->pages[index];
			index = 0;
		}
		vm_reserv_unlock(rv);
		m += VM_LEVEL_0_NPAGES;
		first += VM_LEVEL_0_NPAGES;
		allocpages -= VM_LEVEL_0_NPAGES;
	} while (allocpages >= VM_LEVEL_0_NPAGES);
	return (m_ret);
}

/*
 * Allocate a physical page from an existing or newly created reservation.
 *
 * The page "mpred" must immediately precede the offset "pindex" within the
 * specified object.
 *
 * The object must be locked.
 */
vm_page_t
vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain,
    int req, vm_page_t mpred)
{
	struct vm_domain *vmd;
	vm_page_t m, msucc;
	vm_pindex_t first, leftcap, rightcap;
	vm_reserv_t rv;
	int index;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Is a reservation fundamentally impossible?
	 */
	if (pindex < VM_RESERV_INDEX(object, pindex) ||
	    pindex >= object->size)
		return (NULL);

	/*
	 * Look for an existing reservation.
	 */
	rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
	if (rv != NULL) {
		KASSERT(object != kernel_object || rv->domain == domain,
		    ("vm_reserv_alloc_page: domain mismatch"));
		domain = rv->domain;
		vmd = VM_DOMAIN(domain);
		index = VM_RESERV_INDEX(object, pindex);
		m = &rv->pages[index];
		vm_reserv_lock(rv);
		/* Handle reclaim race. */
		if (rv->object != object ||
		    /* Handle vm_page_rename(m, new_object, ...). */
		    popmap_is_set(rv->popmap, index)) {
			m = NULL;
			goto out;
		}
		if (vm_domain_allocate(vmd, req, 1) == 0)
			m = NULL;
		else
			vm_reserv_populate(rv, index);
out:
		vm_reserv_unlock(rv);
		return (m);
	}

	/*
	 * Could a reservation fit between the first index to the left that
	 * can be used and the first index to the right that cannot be used?
	 *
	 * We must synchronize with the reserv object lock to protect the
	 * pindex/object of the resulting reservations against rename while
	 * we are inspecting.
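	 *
	 * As in vm_reserv_alloc_contig(), "leftcap" is the first index to the
	 * left that can be used and "rightcap" is the first index to the
	 * right that cannot be used.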
	 */
	first = pindex - VM_RESERV_INDEX(object, pindex);
	vm_reserv_object_lock(object);
	if (mpred != NULL) {
		if ((rv = vm_reserv_from_page(mpred))->object != object)
			leftcap = mpred->pindex + 1;
		else
			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
		if (leftcap > first) {
			vm_reserv_object_unlock(object);
			return (NULL);
		}
	}
	if (msucc != NULL) {
		if ((rv = vm_reserv_from_page(msucc))->object != object)
			rightcap = msucc->pindex;
		else
			rightcap = rv->pindex;
		if (first + VM_LEVEL_0_NPAGES > rightcap) {
			vm_reserv_object_unlock(object);
			return (NULL);
		}
	}
	vm_reserv_object_unlock(object);

	/*
	 * Would the last new reservation extend past the end of the object?
	 *
	 * If the object is unlikely to grow don't allocate a reservation for
	 * the tail.
	 */
	if ((object->flags & OBJ_ANON) == 0 &&
	    first + VM_LEVEL_0_NPAGES > object->size)
		return (NULL);

	/*
	 * Allocate and populate the new reservation.
	 */
	m = NULL;
	vmd = VM_DOMAIN(domain);
	if (vm_domain_allocate(vmd, req, 1)) {
		vm_domain_free_lock(vmd);
		m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
		    VM_LEVEL_0_ORDER);
		vm_domain_free_unlock(vmd);
		if (m == NULL) {
			vm_domain_freecnt_inc(vmd, 1);
			return (NULL);
		}
	} else
		return (NULL);
	rv = vm_reserv_from_page(m);
	vm_reserv_lock(rv);
	KASSERT(rv->pages == m,
	    ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
	vm_reserv_insert(rv, object, first);
	index = VM_RESERV_INDEX(object, pindex);
	vm_reserv_populate(rv, index);
	vm_reserv_unlock(rv);

	return (&rv->pages[index]);
}

/*
 * Breaks the given reservation.  All free pages in the reservation
 * are returned to the physical memory allocator.  The reservation's
 * population count and map are reset to their initial state.
 *
 * The given reservation must not be in the partially populated reservation
 * queue.
 */
static void
vm_reserv_break(vm_reserv_t rv)
{
	u_long changes;
	int bitpos, hi, i, lo;

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	vm_reserv_remove(rv);
	rv->pages->psind = 0;
	hi = lo = -1;
	for (i = 0; i <= NPOPMAP; i++) {
		/*
		 * "changes" is a bitmask that marks where a new sequence of
		 * 0s or 1s begins in popmap[i], with last bit in popmap[i-1]
		 * considered to be 1 if and only if lo == hi.  The bits of
		 * popmap[-1] and popmap[NPOPMAP] are considered all 1s.
		 */
		if (i == NPOPMAP)
			changes = lo != hi;
		else {
			changes = rv->popmap[i];
			changes ^= (changes << 1) | (lo == hi);
			rv->popmap[i] = 0;
		}
		while (changes != 0) {
			/*
			 * If the next change marked begins a run of 0s, set
			 * lo to mark that position.  Otherwise set hi and
			 * free pages from lo up to hi.
			 */
			bitpos = ffsl(changes) - 1;
			changes ^= 1UL << bitpos;
			if (lo == hi)
				lo = NBPOPMAP * i + bitpos;
			else {
				hi = NBPOPMAP * i + bitpos;
				vm_domain_free_lock(VM_DOMAIN(rv->domain));
				vm_phys_enqueue_contig(&rv->pages[lo], hi - lo);
				vm_domain_free_unlock(VM_DOMAIN(rv->domain));
				lo = hi;
			}
		}
	}
	rv->popcnt = 0;
	counter_u64_add(vm_reserv_broken, 1);
}

/*
 * Breaks all reservations belonging to the given object.
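 *
 * The object must be locked; the object lock prevents new reservations from
 * being inserted while the list is drained.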
 */
void
vm_reserv_break_all(vm_object_t object)
{
	vm_reserv_t rv;

	/*
	 * This access of object->rvq is unsynchronized so that the
	 * object rvq lock can nest after the domain_free lock.  We
	 * must check for races in the results.  However, the object
	 * lock prevents new additions, so we are guaranteed that when
	 * it returns NULL the object is properly empty.
	 */
	while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
		vm_reserv_lock(rv);
		/* Reclaim race. */
		if (rv->object != object) {
			vm_reserv_unlock(rv);
			continue;
		}
		vm_reserv_domain_lock(rv->domain);
		if (rv->inpartpopq) {
			TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
			rv->inpartpopq = FALSE;
		}
		vm_reserv_domain_unlock(rv->domain);
		vm_reserv_break(rv);
		vm_reserv_unlock(rv);
	}
}

/*
 * Frees the given page if it belongs to a reservation.  Returns TRUE if the
 * page is freed and FALSE otherwise.
 */
boolean_t
vm_reserv_free_page(vm_page_t m)
{
	vm_reserv_t rv;
	boolean_t ret;

	rv = vm_reserv_from_page(m);
	if (rv->object == NULL)
		return (FALSE);
	vm_reserv_lock(rv);
	/* Re-validate after lock. */
	if (rv->object != NULL) {
		vm_reserv_depopulate(rv, m - rv->pages);
		ret = TRUE;
	} else
		ret = FALSE;
	vm_reserv_unlock(rv);

	return (ret);
}

/*
 * Initializes the reservation management system.  Specifically, initializes
 * the reservation array.
 *
 * Requires that vm_page_array and first_page are initialized!
 */
void
vm_reserv_init(void)
{
	vm_paddr_t paddr;
	struct vm_phys_seg *seg;
	struct vm_reserv *rv;
	struct vm_reserv_domain *rvd;
	int i, j, segind;

	/*
	 * Initialize the reservation array.  Specifically, initialize the
	 * "pages" field for every element that has an underlying superpage.
	 */
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
		while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
		    VM_LEVEL_0_SIZE <= seg->end) {
			rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
			rv->pages = PHYS_TO_VM_PAGE(paddr);
			rv->domain = seg->domain;
			mtx_init(&rv->lock, "vm reserv", NULL, MTX_DEF);
			paddr += VM_LEVEL_0_SIZE;
		}
	}
	for (i = 0; i < MAXMEMDOM; i++) {
		rvd = &vm_rvd[i];
		mtx_init(&rvd->lock, "vm reserv domain", NULL, MTX_DEF);
		TAILQ_INIT(&rvd->partpop);
		mtx_init(&rvd->marker.lock, "vm reserv marker", NULL, MTX_DEF);

		/*
		 * Fully populated reservations should never be present in the
		 * partially populated reservation queues.
		 */
		rvd->marker.popcnt = VM_LEVEL_0_NPAGES;
		for (j = 0; j < VM_LEVEL_0_NPAGES; j++)
			popmap_set(rvd->marker.popmap, j);
	}

	for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
		mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL,
		    MTX_DEF);
}

/*
 * Returns true if the given page belongs to a reservation and that page is
 * free.  Otherwise, returns false.
 */
bool
vm_reserv_is_page_free(vm_page_t m)
{
	vm_reserv_t rv;

	rv = vm_reserv_from_page(m);
	if (rv->object == NULL)
		return (false);
	return (popmap_is_clear(rv->popmap, m - rv->pages));
}

/*
 * If the given page belongs to a reservation, returns the level of that
 * reservation.  Otherwise, returns -1.
 */
int
vm_reserv_level(vm_page_t m)
{
	vm_reserv_t rv;

	rv = vm_reserv_from_page(m);
	return (rv->object != NULL ? 0 : -1);
}

/*
 * Returns a reservation level if the given page belongs to a fully populated
 * reservation and -1 otherwise.
 */
int
vm_reserv_level_iffullpop(vm_page_t m)
{
	vm_reserv_t rv;

	rv = vm_reserv_from_page(m);
	return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
}

/*
 * Remove a partially populated reservation from the queue.
 */
static void
vm_reserv_dequeue(vm_reserv_t rv)
{

	vm_reserv_domain_assert_locked(rv->domain);
	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	KASSERT(rv->inpartpopq,
	    ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));

	TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
	rv->inpartpopq = FALSE;
}

/*
 * Breaks the given partially populated reservation, releasing its free pages
 * to the physical memory allocator.
 */
static void
vm_reserv_reclaim(vm_reserv_t rv)
{

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	if (rv->inpartpopq) {
		vm_reserv_domain_lock(rv->domain);
		vm_reserv_dequeue(rv);
		vm_reserv_domain_unlock(rv->domain);
	}
	vm_reserv_break(rv);
	counter_u64_add(vm_reserv_reclaimed, 1);
}

/*
 * Breaks a reservation near the head of the partially populated reservation
 * queue, releasing its free pages to the physical memory allocator.  Returns
 * TRUE if a reservation is broken and FALSE otherwise.
 */
bool
vm_reserv_reclaim_inactive(int domain)
{
	vm_reserv_t rv;

	vm_reserv_domain_lock(domain);
	TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
		/*
		 * A locked reservation is likely being updated or reclaimed,
		 * so just skip ahead.
		 */
		if (rv != &vm_rvd[domain].marker && vm_reserv_trylock(rv)) {
			vm_reserv_dequeue(rv);
			break;
		}
	}
	vm_reserv_domain_unlock(domain);
	if (rv != NULL) {
		vm_reserv_reclaim(rv);
		vm_reserv_unlock(rv);
		return (true);
	}
	return (false);
}

/*
 * Determine whether this reservation has free pages that satisfy the given
 * request for contiguous physical memory.  Start searching from the lower
 * bound, defined by "low".
 */
static bool
vm_reserv_test_contig(vm_reserv_t rv, u_long npages, vm_paddr_t low,
    vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa, size;
	u_long changes;
	int bitpos, bits_left, i, hi, lo, n;

	vm_reserv_assert_locked(rv);
	size = npages << PAGE_SHIFT;
	pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
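	/*
	 * Restrict the popmap scan to the range of page indices whose
	 * physical addresses satisfy the caller's "low" and "high" bounds.
	 */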
	lo = (pa < low) ?
	    ((low + PAGE_MASK - pa) >> PAGE_SHIFT) : 0;
	i = lo / NBPOPMAP;
	changes = rv->popmap[i] | ((1UL << (lo % NBPOPMAP)) - 1);
	hi = (pa + VM_LEVEL_0_SIZE > high) ?
	    ((high + PAGE_MASK - pa) >> PAGE_SHIFT) : VM_LEVEL_0_NPAGES;
	n = hi / NBPOPMAP;
	bits_left = hi % NBPOPMAP;
	hi = lo = -1;
	for (;;) {
		/*
		 * "changes" is a bitmask that marks where a new sequence of
		 * 0s or 1s begins in popmap[i], with last bit in popmap[i-1]
		 * considered to be 1 if and only if lo == hi.  The bits of
		 * popmap[-1] and popmap[NPOPMAP] are considered all 1s.
		 */
		changes ^= (changes << 1) | (lo == hi);
		while (changes != 0) {
			/*
			 * If the next change marked begins a run of 0s, set
			 * lo to mark that position.  Otherwise set hi and
			 * look for a satisfactory first page from lo up to hi.
			 */
			bitpos = ffsl(changes) - 1;
			changes ^= 1UL << bitpos;
			if (lo == hi) {
				lo = NBPOPMAP * i + bitpos;
				continue;
			}
			hi = NBPOPMAP * i + bitpos;
			pa = VM_PAGE_TO_PHYS(&rv->pages[lo]);
			if ((pa & (alignment - 1)) != 0) {
				/* Skip to next aligned page. */
				lo += (((pa - 1) | (alignment - 1)) + 1) >>
				    PAGE_SHIFT;
				if (lo >= VM_LEVEL_0_NPAGES)
					return (false);
				pa = VM_PAGE_TO_PHYS(&rv->pages[lo]);
			}
			if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
				/* Skip to next boundary-matching page. */
				lo += (((pa - 1) | (boundary - 1)) + 1) >>
				    PAGE_SHIFT;
				if (lo >= VM_LEVEL_0_NPAGES)
					return (false);
				pa = VM_PAGE_TO_PHYS(&rv->pages[lo]);
			}
			if (lo * PAGE_SIZE + size <= hi * PAGE_SIZE)
				return (true);
			lo = hi;
		}
		if (++i < n)
			changes = rv->popmap[i];
		else if (i == n)
			changes = bits_left == 0 ? -1UL :
			    (rv->popmap[n] | (-1UL << bits_left));
		else
			return (false);
	}
}

/*
 * Searches the partially populated reservation queue for the least recently
 * changed reservation with free pages that satisfy the given request for
 * contiguous physical memory.  If a satisfactory reservation is found, it is
 * broken.  Returns true if a reservation is broken and false otherwise.
 */
bool
vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low,
    vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_reserv_queue *queue;
	vm_paddr_t pa, size;
	vm_reserv_t marker, rv, rvn;

	if (npages > VM_LEVEL_0_NPAGES - 1)
		return (false);
	marker = &vm_rvd[domain].marker;
	queue = &vm_rvd[domain].partpop;
	size = npages << PAGE_SHIFT;

	vm_reserv_domain_scan_lock(domain);
	vm_reserv_domain_lock(domain);
	TAILQ_FOREACH_SAFE(rv, queue, partpopq, rvn) {
		pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
		if (pa + VM_LEVEL_0_SIZE - size < low) {
			/* This entire reservation is too low; go to next. */
			continue;
		}
		if (pa + size > high) {
			/* This entire reservation is too high; go to next. */
			continue;
		}

		if (vm_reserv_trylock(rv) == 0) {
			TAILQ_INSERT_AFTER(queue, rv, marker, partpopq);
			vm_reserv_domain_unlock(domain);
			vm_reserv_lock(rv);
			if (!rv->inpartpopq ||
			    TAILQ_NEXT(rv, partpopq) != marker) {
				vm_reserv_unlock(rv);
				vm_reserv_domain_lock(domain);
				rvn = TAILQ_NEXT(marker, partpopq);
				TAILQ_REMOVE(queue, marker, partpopq);
				continue;
			}
			vm_reserv_domain_lock(domain);
			TAILQ_REMOVE(queue, marker, partpopq);
		}
		vm_reserv_domain_unlock(domain);
		if (vm_reserv_test_contig(rv, npages, low, high,
		    alignment, boundary)) {
			vm_reserv_domain_scan_unlock(domain);
			vm_reserv_reclaim(rv);
			vm_reserv_unlock(rv);
			return (true);
		}
		vm_reserv_unlock(rv);
		vm_reserv_domain_lock(domain);
	}
	vm_reserv_domain_unlock(domain);
	vm_reserv_domain_scan_unlock(domain);
	return (false);
}

/*
 * Transfers the reservation underlying the given page to a new object.
 *
 * The object must be locked.
 */
void
vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
    vm_pindex_t old_object_offset)
{
	vm_reserv_t rv;

	VM_OBJECT_ASSERT_WLOCKED(new_object);
	rv = vm_reserv_from_page(m);
	if (rv->object == old_object) {
		vm_reserv_lock(rv);
		CTR6(KTR_VM,
		    "%s: rv %p object %p new %p popcnt %d inpartpop %d",
		    __FUNCTION__, rv, rv->object, new_object, rv->popcnt,
		    rv->inpartpopq);
		if (rv->object == old_object) {
			vm_reserv_object_lock(old_object);
			rv->object = NULL;
			LIST_REMOVE(rv, objq);
			vm_reserv_object_unlock(old_object);
			vm_reserv_object_lock(new_object);
			rv->object = new_object;
			rv->pindex -= old_object_offset;
			LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
			vm_reserv_object_unlock(new_object);
		}
		vm_reserv_unlock(rv);
	}
}

/*
 * Returns the size (in bytes) of a reservation of the specified level.
 */
int
vm_reserv_size(int level)
{

	switch (level) {
	case 0:
		return (VM_LEVEL_0_SIZE);
	case -1:
		return (PAGE_SIZE);
	default:
		return (0);
	}
}

/*
 * Allocates the virtual and physical memory required by the reservation
 * management system's data structures, in particular, the reservation array.
 */
vm_paddr_t
vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end)
{
	vm_paddr_t new_end, high_water;
	size_t size;
	int i;

	high_water = phys_avail[1];
	for (i = 0; i < vm_phys_nsegs; i++) {
		if (vm_phys_segs[i].end > high_water)
			high_water = vm_phys_segs[i].end;
	}

	/* Skip the first chunk.  It is already accounted for. */
	for (i = 2; phys_avail[i + 1] != 0; i += 2) {
		if (phys_avail[i + 1] > high_water)
			high_water = phys_avail[i + 1];
	}

	/*
	 * Calculate the size (in bytes) of the reservation array.  Round up
	 * from "high_water" because every small page is mapped to an element
	 * in the reservation array based on its physical address.  Thus, the
	 * number of elements in the reservation array can be greater than the
	 * number of superpages.
	 */
	size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv);

	/*
	 * Allocate and map the physical memory for the reservation array.  The
	 * next available virtual address is returned by reference.
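	 *
	 * The array is carved out of the top of the given physical range, so
	 * the value returned to the caller is the new end of usable physical
	 * memory.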
	 */
	new_end = end - round_page(size);
	vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	bzero(vm_reserv_array, size);

	/*
	 * Return the next available physical address.
	 */
	return (new_end);
}

/*
 * Returns the superpage containing the given page.
 */
vm_page_t
vm_reserv_to_superpage(vm_page_t m)
{
	vm_reserv_t rv;

	VM_OBJECT_ASSERT_LOCKED(m->object);
	rv = vm_reserv_from_page(m);
	if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES)
		m = rv->pages;
	else
		m = NULL;

	return (m);
}

#endif	/* VM_NRESERVLEVEL > 0 */