1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2002-2006 Rice University 5 * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu> 6 * All rights reserved. 7 * 8 * This software was developed for the FreeBSD Project by Alan L. Cox, 9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Superpage reservation management module 36 * 37 * Any external functions defined by this module are only to be used by the 38 * virtual memory system. 
*/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/ktr.h>
#include <sys/vmmeter.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>

/*
 * The reservation system supports the speculative allocation of large physical
 * pages ("superpages").  Speculative allocation enables the fully automatic
 * utilization of superpages by the virtual memory system.  In other words, no
 * programmatic directives are required to use superpages.
 */

/*
 * The code that follows is compiled only when at least one reservation level
 * is configured.
 */
#if VM_NRESERVLEVEL > 0

/*
 * The largest level 0 order that storage must be sized for.  When it is not
 * defined elsewhere, it defaults to VM_LEVEL_0_ORDER.
 */
#ifndef VM_LEVEL_0_ORDER_MAX
#define	VM_LEVEL_0_ORDER_MAX	VM_LEVEL_0_ORDER
#endif

/*
 * The number of small pages that are contained in a level 0 reservation.
 * The "_MAX" variant is derived from VM_LEVEL_0_ORDER_MAX and is used to size
 * the population map (see NPOPMAP_MAX).
 */
#define	VM_LEVEL_0_NPAGES	(1 << VM_LEVEL_0_ORDER)
#define	VM_LEVEL_0_NPAGES_MAX	(1 << VM_LEVEL_0_ORDER_MAX)

/*
 * The number of bits by which a physical address is shifted to obtain the
 * reservation number
 */
#define	VM_LEVEL_0_SHIFT	(VM_LEVEL_0_ORDER + PAGE_SHIFT)

/*
 * The size of a level 0 reservation in bytes
 */
#define	VM_LEVEL_0_SIZE		(1 << VM_LEVEL_0_SHIFT)

/*
 * Computes the index of the small page underlying the given (object, pindex)
 * within the reservation's array of small pages.
104 */ 105 #define VM_RESERV_INDEX(object, pindex) \ 106 (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1)) 107 108 /* 109 * The size of a population map entry 110 */ 111 typedef u_long popmap_t; 112 113 /* 114 * The number of bits in a population map entry 115 */ 116 #define NBPOPMAP (NBBY * sizeof(popmap_t)) 117 118 /* 119 * The number of population map entries in a reservation 120 */ 121 #define NPOPMAP howmany(VM_LEVEL_0_NPAGES, NBPOPMAP) 122 #define NPOPMAP_MAX howmany(VM_LEVEL_0_NPAGES_MAX, NBPOPMAP) 123 124 /* 125 * Number of elapsed ticks before we update the LRU queue position. Used 126 * to reduce contention and churn on the list. 127 */ 128 #define PARTPOPSLOP 1 129 130 /* 131 * Clear a bit in the population map. 132 */ 133 static __inline void 134 popmap_clear(popmap_t popmap[], int i) 135 { 136 137 popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP)); 138 } 139 140 /* 141 * Set a bit in the population map. 142 */ 143 static __inline void 144 popmap_set(popmap_t popmap[], int i) 145 { 146 147 popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP); 148 } 149 150 /* 151 * Is a bit in the population map clear? 152 */ 153 static __inline boolean_t 154 popmap_is_clear(popmap_t popmap[], int i) 155 { 156 157 return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0); 158 } 159 160 /* 161 * Is a bit in the population map set? 162 */ 163 static __inline boolean_t 164 popmap_is_set(popmap_t popmap[], int i) 165 { 166 167 return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0); 168 } 169 170 /* 171 * The reservation structure 172 * 173 * A reservation structure is constructed whenever a large physical page is 174 * speculatively allocated to an object. The reservation provides the small 175 * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets 176 * within that object. The reservation's "popcnt" tracks the number of these 177 * small physical pages that are in use at any given time. 
When and if the
 * reservation is not fully utilized, it appears in the queue of partially
 * populated reservations.  The reservation always appears on the containing
 * object's list of reservations.
 *
 * A partially populated reservation can be broken and reclaimed at any time.
 *
 * Locking key for the field annotations below:
 *
 * c - constant after boot
 * d - vm_reserv_domain_lock
 * o - vm_reserv_object_lock
 * r - vm_reserv_lock
 * s - vm_reserv_domain_scan_lock
 */
struct vm_reserv {
	struct mtx	lock;			/* reservation lock. */
	TAILQ_ENTRY(vm_reserv) partpopq;	/* (d, r) per-domain queue. */
	LIST_ENTRY(vm_reserv) objq;		/* (o, r) object queue */
	vm_object_t	object;			/* (o, r) containing object */
	vm_pindex_t	pindex;			/* (o, r) offset in object */
	vm_page_t	pages;			/* (c) first page  */
	uint16_t	popcnt;			/* (r) # of pages in use */
	uint8_t		domain;			/* (c) NUMA domain. */
	char		inpartpopq;		/* (d, r) TRUE while on partpopq */
	int		lasttick;		/* (r) last pop update tick. */
	popmap_t	popmap[NPOPMAP_MAX];	/* (r) bit vector, used pages */
};

/* Head type for a queue of reservations linked through "partpopq". */
TAILQ_HEAD(vm_reserv_queue, vm_reserv);

/*
 * Wrappers around the per-reservation mutex ("r" in the locking key).
 */
#define	vm_reserv_lockptr(rv)		(&(rv)->lock)
#define	vm_reserv_assert_locked(rv)					\
	    mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
#define	vm_reserv_lock(rv)		mtx_lock(vm_reserv_lockptr(rv))
#define	vm_reserv_trylock(rv)		mtx_trylock(vm_reserv_lockptr(rv))
#define	vm_reserv_unlock(rv)		mtx_unlock(vm_reserv_lockptr(rv))

/*
 * The reservation array
 *
 * This array is analogous in function to vm_page_array.  It differs in the
 * respect that it may contain a greater number of useful reservation
 * structures than there are (physical) superpages.  These "invalid"
 * reservation structures exist to trade-off space for time in the
 * implementation of vm_reserv_from_page().  Invalid reservation structures are
 * distinguishable from "valid" reservation structures by inspecting the
 * reservation's "pages" field.
Invalid reservation structures have a NULL
 * "pages" field.
 *
 * vm_reserv_from_page() maps a small (physical) page to an element of this
 * array by computing a physical reservation number from the page's physical
 * address (the address shifted right by VM_LEVEL_0_SHIFT).  The physical
 * reservation number is used as the array index; when VM_PHYSSEG_SPARSE is
 * defined, the index is instead taken relative to the owning physical
 * segment's "first_reserv" pointer.
 *
 * An "active" reservation is a valid reservation structure that has a non-NULL
 * "object" field and a non-zero "popcnt" field.  In other words, every active
 * reservation belongs to a particular object.  Moreover, every active
 * reservation has an entry in the containing object's list of reservations.
 */
static vm_reserv_t vm_reserv_array;

/*
 * The per-domain partially populated reservation queues
 *
 * These queues enable the fast recovery of an unused free small page from a
 * partially populated reservation.  The reservation at the head of a queue
 * is the least recently changed, partially populated reservation.
 *
 * Access to this queue is synchronized by the per-domain reservation lock.
 * Threads reclaiming free pages from the queue must hold the per-domain scan
 * lock.
*/
struct vm_reserv_domain {
	struct mtx 		lock;		/* per-domain reservation lock */
	struct vm_reserv_queue	partpop;	/* (d) */
	struct vm_reserv	marker;		/* (d, s) scan marker/lock */
} __aligned(CACHE_LINE_SIZE);

/* One queue, lock and marker per possible memory domain. */
static struct vm_reserv_domain vm_rvd[MAXMEMDOM];

/*
 * Wrappers around the per-domain reservation lock ("d" in the locking key).
 */
#define	vm_reserv_domain_lockptr(d)	(&vm_rvd[(d)].lock)
#define	vm_reserv_domain_assert_locked(d)	\
	mtx_assert(vm_reserv_domain_lockptr(d), MA_OWNED)
#define	vm_reserv_domain_lock(d)	mtx_lock(vm_reserv_domain_lockptr(d))
#define	vm_reserv_domain_unlock(d)	mtx_unlock(vm_reserv_domain_lockptr(d))

/*
 * The per-domain scan lock ("s" in the locking key) is the mutex embedded in
 * the domain's marker reservation.
 */
#define	vm_reserv_domain_scan_lock(d)	mtx_lock(&vm_rvd[(d)].marker.lock)
#define	vm_reserv_domain_scan_unlock(d)	mtx_unlock(&vm_rvd[(d)].marker.lock)

/* Read-only statistics exported under the "vm.reserv" sysctl node. */
static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Reservation Info");

static COUNTER_U64_DEFINE_EARLY(vm_reserv_broken);
SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
    &vm_reserv_broken, "Cumulative number of broken reservations");

static COUNTER_U64_DEFINE_EARLY(vm_reserv_freed);
SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
    &vm_reserv_freed, "Cumulative number of freed reservations");

static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD,
    NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");

static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);

SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_reserv_partpopq, "A",
    "Partially populated reservation queues");

static COUNTER_U64_DEFINE_EARLY(vm_reserv_reclaimed);
SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
    &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations");

/*
 * The object lock pool is used to synchronize the
rvq. We can not use a 293 * pool mutex because it is required before malloc works. 294 * 295 * The "hash" function could be made faster without divide and modulo. 296 */ 297 #define VM_RESERV_OBJ_LOCK_COUNT MAXCPU 298 299 struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT]; 300 301 #define vm_reserv_object_lock_idx(object) \ 302 (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT) 303 #define vm_reserv_object_lock_ptr(object) \ 304 &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))] 305 #define vm_reserv_object_lock(object) \ 306 mtx_lock(vm_reserv_object_lock_ptr((object))) 307 #define vm_reserv_object_unlock(object) \ 308 mtx_unlock(vm_reserv_object_lock_ptr((object))) 309 310 static void vm_reserv_break(vm_reserv_t rv); 311 static void vm_reserv_depopulate(vm_reserv_t rv, int index); 312 static vm_reserv_t vm_reserv_from_page(vm_page_t m); 313 static boolean_t vm_reserv_has_pindex(vm_reserv_t rv, 314 vm_pindex_t pindex); 315 static void vm_reserv_populate(vm_reserv_t rv, int index); 316 static void vm_reserv_reclaim(vm_reserv_t rv); 317 318 /* 319 * Returns the current number of full reservations. 320 * 321 * Since the number of full reservations is computed without acquiring any 322 * locks, the returned value is inexact. 
323 */ 324 static int 325 sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS) 326 { 327 vm_paddr_t paddr; 328 struct vm_phys_seg *seg; 329 vm_reserv_t rv; 330 int fullpop, segind; 331 332 fullpop = 0; 333 for (segind = 0; segind < vm_phys_nsegs; segind++) { 334 seg = &vm_phys_segs[segind]; 335 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); 336 #ifdef VM_PHYSSEG_SPARSE 337 rv = seg->first_reserv + (paddr >> VM_LEVEL_0_SHIFT) - 338 (seg->start >> VM_LEVEL_0_SHIFT); 339 #else 340 rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT]; 341 #endif 342 while (paddr + VM_LEVEL_0_SIZE > paddr && paddr + 343 VM_LEVEL_0_SIZE <= seg->end) { 344 fullpop += rv->popcnt == VM_LEVEL_0_NPAGES; 345 paddr += VM_LEVEL_0_SIZE; 346 rv++; 347 } 348 } 349 return (sysctl_handle_int(oidp, &fullpop, 0, req)); 350 } 351 352 /* 353 * Describes the current state of the partially populated reservation queue. 354 */ 355 static int 356 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS) 357 { 358 struct sbuf sbuf; 359 vm_reserv_t rv; 360 int counter, error, domain, level, unused_pages; 361 362 error = sysctl_wire_old_buffer(req, 0); 363 if (error != 0) 364 return (error); 365 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 366 sbuf_printf(&sbuf, "\nDOMAIN LEVEL SIZE NUMBER\n\n"); 367 for (domain = 0; domain < vm_ndomains; domain++) { 368 for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) { 369 counter = 0; 370 unused_pages = 0; 371 vm_reserv_domain_lock(domain); 372 TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) { 373 if (rv == &vm_rvd[domain].marker) 374 continue; 375 counter++; 376 unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt; 377 } 378 vm_reserv_domain_unlock(domain); 379 sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n", 380 domain, level, 381 unused_pages * ((int)PAGE_SIZE / 1024), counter); 382 } 383 } 384 error = sbuf_finish(&sbuf); 385 sbuf_delete(&sbuf); 386 return (error); 387 } 388 389 /* 390 * Remove a reservation from the object's objq. 
391 */ 392 static void 393 vm_reserv_remove(vm_reserv_t rv) 394 { 395 vm_object_t object; 396 397 vm_reserv_assert_locked(rv); 398 CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", 399 __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); 400 KASSERT(rv->object != NULL, 401 ("vm_reserv_remove: reserv %p is free", rv)); 402 KASSERT(!rv->inpartpopq, 403 ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv)); 404 object = rv->object; 405 vm_reserv_object_lock(object); 406 LIST_REMOVE(rv, objq); 407 rv->object = NULL; 408 vm_reserv_object_unlock(object); 409 } 410 411 /* 412 * Insert a new reservation into the object's objq. 413 */ 414 static void 415 vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex) 416 { 417 int i; 418 419 vm_reserv_assert_locked(rv); 420 CTR6(KTR_VM, 421 "%s: rv %p(%p) object %p new %p popcnt %d", 422 __FUNCTION__, rv, rv->pages, rv->object, object, 423 rv->popcnt); 424 KASSERT(rv->object == NULL, 425 ("vm_reserv_insert: reserv %p isn't free", rv)); 426 KASSERT(rv->popcnt == 0, 427 ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv)); 428 KASSERT(!rv->inpartpopq, 429 ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv)); 430 for (i = 0; i < NPOPMAP; i++) 431 KASSERT(rv->popmap[i] == 0, 432 ("vm_reserv_insert: reserv %p's popmap is corrupted", rv)); 433 vm_reserv_object_lock(object); 434 rv->pindex = pindex; 435 rv->object = object; 436 rv->lasttick = ticks; 437 LIST_INSERT_HEAD(&object->rvq, rv, objq); 438 vm_reserv_object_unlock(object); 439 } 440 441 /* 442 * Reduces the given reservation's population count. If the population count 443 * becomes zero, the reservation is destroyed. Additionally, moves the 444 * reservation to the tail of the partially populated reservation queue if the 445 * population count is non-zero. 
 *
 * "index" is the position of the page being released within the reservation.
 * The reservation must be locked by the caller.
*/
static void
vm_reserv_depopulate(vm_reserv_t rv, int index)
{
	struct vm_domain *vmd;

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_depopulate: reserv %p is free", rv));
	KASSERT(popmap_is_set(rv->popmap, index),
	    ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
	    index));
	KASSERT(rv->popcnt > 0,
	    ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
	KASSERT(rv->domain < vm_ndomains,
	    ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
	    rv, rv->domain));
	/*
	 * A fully populated reservation is about to lose a page: clear the
	 * first page's "psind" so that it no longer marks a superpage.
	 */
	if (rv->popcnt == VM_LEVEL_0_NPAGES) {
		KASSERT(rv->pages->psind == 1,
		    ("vm_reserv_depopulate: reserv %p is already demoted",
		    rv));
		rv->pages->psind = 0;
	}
	popmap_clear(rv->popmap, index);
	rv->popcnt--;
	/*
	 * Touch the partially populated queue only if enough ticks have
	 * elapsed since the last update, or if the reservation just became
	 * empty and therefore must leave the queue.
	 */
	if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP ||
	    rv->popcnt == 0) {
		vm_reserv_domain_lock(rv->domain);
		if (rv->inpartpopq) {
			TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
			rv->inpartpopq = FALSE;
		}
		if (rv->popcnt != 0) {
			rv->inpartpopq = TRUE;
			TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv,
			    partpopq);
		}
		vm_reserv_domain_unlock(rv->domain);
		rv->lasttick = ticks;
	}
	vmd = VM_DOMAIN(rv->domain);
	if (rv->popcnt == 0) {
		/*
		 * The reservation is empty: detach it from its object and
		 * hand the whole superpage back to the physical allocator.
		 */
		vm_reserv_remove(rv);
		vm_domain_free_lock(vmd);
		vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
		vm_domain_free_unlock(vmd);
		counter_u64_add(vm_reserv_freed, 1);
	}
	/* NOTE(review): presumably credits the one released page as free. */
	vm_domain_freecnt_inc(vmd, 1);
}

/*
 * Returns the reservation to which the given page might belong.
501 */ 502 static __inline vm_reserv_t 503 vm_reserv_from_page(vm_page_t m) 504 { 505 #ifdef VM_PHYSSEG_SPARSE 506 struct vm_phys_seg *seg; 507 508 seg = &vm_phys_segs[m->segind]; 509 return (seg->first_reserv + (VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT) - 510 (seg->start >> VM_LEVEL_0_SHIFT)); 511 #else 512 return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]); 513 #endif 514 } 515 516 /* 517 * Returns an existing reservation or NULL and initialized successor pointer. 518 */ 519 static vm_reserv_t 520 vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex, 521 vm_page_t mpred, vm_page_t *msuccp) 522 { 523 vm_reserv_t rv; 524 vm_page_t msucc; 525 526 msucc = NULL; 527 if (mpred != NULL) { 528 KASSERT(mpred->object == object, 529 ("vm_reserv_from_object: object doesn't contain mpred")); 530 KASSERT(mpred->pindex < pindex, 531 ("vm_reserv_from_object: mpred doesn't precede pindex")); 532 rv = vm_reserv_from_page(mpred); 533 if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) 534 goto found; 535 msucc = TAILQ_NEXT(mpred, listq); 536 } else 537 msucc = TAILQ_FIRST(&object->memq); 538 if (msucc != NULL) { 539 KASSERT(msucc->pindex > pindex, 540 ("vm_reserv_from_object: msucc doesn't succeed pindex")); 541 rv = vm_reserv_from_page(msucc); 542 if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) 543 goto found; 544 } 545 rv = NULL; 546 547 found: 548 *msuccp = msucc; 549 550 return (rv); 551 } 552 553 /* 554 * Returns TRUE if the given reservation contains the given page index and 555 * FALSE otherwise. 556 */ 557 static __inline boolean_t 558 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex) 559 { 560 561 return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0); 562 } 563 564 /* 565 * Increases the given reservation's population count. Moves the reservation 566 * to the tail of the partially populated reservation queue. 
 *
 * "index" is the position of the page being taken within the reservation.
 * The reservation must be locked by the caller.
*/
static void
vm_reserv_populate(vm_reserv_t rv, int index)
{

	vm_reserv_assert_locked(rv);
	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_populate: reserv %p is free", rv));
	KASSERT(popmap_is_clear(rv->popmap, index),
	    ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
	    index));
	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
	    ("vm_reserv_populate: reserv %p is already full", rv));
	KASSERT(rv->pages->psind == 0,
	    ("vm_reserv_populate: reserv %p is already promoted", rv));
	KASSERT(rv->domain < vm_ndomains,
	    ("vm_reserv_populate: reserv %p's domain is corrupted %d",
	    rv, rv->domain));
	popmap_set(rv->popmap, index);
	rv->popcnt++;
	/*
	 * Skip the queue update (and the domain lock) when the reservation
	 * was requeued recently, is already on the queue, and has not just
	 * become full.
	 */
	if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP &&
	    rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES)
		return;
	rv->lasttick = ticks;
	vm_reserv_domain_lock(rv->domain);
	if (rv->inpartpopq) {
		TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
		rv->inpartpopq = FALSE;
	}
	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
		/* Still partially populated: move to the queue's tail. */
		rv->inpartpopq = TRUE;
		TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq);
	} else {
		/*
		 * Now fully populated: the reservation stays off the queue
		 * and the first page's "psind" is set to mark a superpage.
		 */
		KASSERT(rv->pages->psind == 0,
		    ("vm_reserv_populate: reserv %p is already promoted",
		    rv));
		rv->pages->psind = 1;
	}
	vm_reserv_domain_unlock(rv->domain);
}

/*
 * Allocates a contiguous set of physical pages of the given size "npages"
 * from existing or newly created reservations.  All of the physical pages
 * must be at or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.
If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross any
 * physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 *
 * The page "mpred" must immediately precede the offset "pindex" within the
 * specified object.
 *
 * Returns the first page of the allocated run, or NULL if the request cannot
 * be satisfied from a reservation.
 *
 * The object must be locked.
 */
vm_page_t
vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain,
    int req, vm_page_t mpred, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_domain *vmd;
	vm_paddr_t pa, size;
	vm_page_t m, m_ret, msucc;
	vm_pindex_t first, leftcap, rightcap;
	vm_reserv_t rv;
	u_long allocpages, maxpages, minpages;
	int i, index, n;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));

	/*
	 * Is a reservation fundamentally impossible?
	 */
	if (pindex < VM_RESERV_INDEX(object, pindex) ||
	    pindex + npages > object->size)
		return (NULL);

	/*
	 * All reservations of a particular size have the same alignment.
	 * Assuming that the first page is allocated from a reservation, the
	 * least significant bits of its physical address can be determined
	 * from its offset from the beginning of the reservation and the size
	 * of the reservation.
	 *
	 * Could the specified index within a reservation of the smallest
	 * possible size satisfy the alignment and boundary requirements?
	 */
	pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
	if ((pa & (alignment - 1)) != 0)
		return (NULL);
	size = npages << PAGE_SHIFT;
	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (NULL);

	/*
	 * Look for an existing reservation.
	 */
	rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
	if (rv != NULL) {
		KASSERT(object != kernel_object || rv->domain == domain,
		    ("vm_reserv_alloc_contig: domain mismatch"));
		index = VM_RESERV_INDEX(object, pindex);
		/* Does the allocation fit within the reservation? */
		if (index + npages > VM_LEVEL_0_NPAGES)
			return (NULL);
		/* From here on, use the reservation's own domain. */
		domain = rv->domain;
		vmd = VM_DOMAIN(domain);
		vm_reserv_lock(rv);
		/* Handle reclaim race. */
		if (rv->object != object)
			goto out;
		m = &rv->pages[index];
		pa = VM_PAGE_TO_PHYS(m);
		/* Re-check the constraints against the real address. */
		if (pa < low || pa + size > high ||
		    (pa & (alignment - 1)) != 0 ||
		    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
			goto out;
		/* Handle vm_page_rename(m, new_object, ...). */
		for (i = 0; i < npages; i++)
			if (popmap_is_set(rv->popmap, index + i))
				goto out;
		if (!vm_domain_allocate(vmd, req, npages))
			goto out;
		for (i = 0; i < npages; i++)
			vm_reserv_populate(rv, index + i);
		vm_reserv_unlock(rv);
		return (m);
out:
		vm_reserv_unlock(rv);
		return (NULL);
	}

	/*
	 * Could at least one reservation fit between the first index to the
	 * left that can be used ("leftcap") and the first index to the right
	 * that cannot be used ("rightcap")?
	 *
	 * We must synchronize with the reserv object lock to protect the
	 * pindex/object of the resulting reservations against rename while
	 * we are inspecting.
	 */
	first = pindex - VM_RESERV_INDEX(object, pindex);
	minpages = VM_RESERV_INDEX(object, pindex) + npages;
	maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
	allocpages = maxpages;
	vm_reserv_object_lock(object);
	if (mpred != NULL) {
		if ((rv = vm_reserv_from_page(mpred))->object != object)
			leftcap = mpred->pindex + 1;
		else
			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
		if (leftcap > first) {
			vm_reserv_object_unlock(object);
			return (NULL);
		}
	}
	if (msucc != NULL) {
		if ((rv = vm_reserv_from_page(msucc))->object != object)
			rightcap = msucc->pindex;
		else
			rightcap = rv->pindex;
		if (first + maxpages > rightcap) {
			if (maxpages == VM_LEVEL_0_NPAGES) {
				vm_reserv_object_unlock(object);
				return (NULL);
			}

			/*
			 * At least one reservation will fit between "leftcap"
			 * and "rightcap".  However, a reservation for the
			 * last of the requested pages will not fit.  Reduce
			 * the size of the upcoming allocation accordingly.
			 */
			allocpages = minpages;
		}
	}
	vm_reserv_object_unlock(object);

	/*
	 * Would the last new reservation extend past the end of the object?
	 *
	 * If the object is unlikely to grow don't allocate a reservation for
	 * the tail.
	 */
	if ((object->flags & OBJ_ANON) == 0 &&
	    first + maxpages > object->size) {
		if (maxpages == VM_LEVEL_0_NPAGES)
			return (NULL);
		allocpages = minpages;
	}

	/*
	 * Allocate the physical pages.  The alignment and boundary specified
	 * for this allocation may be different from the alignment and
	 * boundary specified for the requested pages.  For instance, the
	 * specified index may not be the first page within the first new
	 * reservation.
	 *
	 * Note that vm_domain_allocate() is called with "npages", the number
	 * of pages requested, while "allocpages" physical pages are obtained;
	 * the same "npages" is given back through vm_domain_freecnt_inc() if
	 * the physical allocation fails.
	 */
	m = NULL;
	vmd = VM_DOMAIN(domain);
	if (vm_domain_allocate(vmd, req, npages)) {
		vm_domain_free_lock(vmd);
		m = vm_phys_alloc_contig(domain, allocpages, low, high,
		    ulmax(alignment, VM_LEVEL_0_SIZE),
		    boundary > VM_LEVEL_0_SIZE ? boundary : 0);
		vm_domain_free_unlock(vmd);
		if (m == NULL) {
			vm_domain_freecnt_inc(vmd, npages);
			return (NULL);
		}
	} else
		return (NULL);
	KASSERT(vm_page_domain(m) == domain,
	    ("vm_reserv_alloc_contig: Page domain does not match requested."));

	/*
	 * The allocated physical pages always begin at a reservation
	 * boundary, but they do not always end at a reservation boundary.
	 * Initialize every reservation that is completely covered by the
	 * allocated physical pages.
	 */
	m_ret = NULL;
	index = VM_RESERV_INDEX(object, pindex);
	do {
		rv = vm_reserv_from_page(m);
		KASSERT(rv->pages == m,
		    ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
		    rv));
		vm_reserv_lock(rv);
		vm_reserv_insert(rv, object, first);
		/* Populate as many requested pages as fit in this one. */
		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
		for (i = 0; i < n; i++)
			vm_reserv_populate(rv, index + i);
		npages -= n;
		if (m_ret == NULL) {
			/*
			 * The first reservation supplies the returned page;
			 * later reservations are populated from index 0.
			 */
			m_ret = &rv->pages[index];
			index = 0;
		}
		vm_reserv_unlock(rv);
		m += VM_LEVEL_0_NPAGES;
		first += VM_LEVEL_0_NPAGES;
		allocpages -= VM_LEVEL_0_NPAGES;
	} while (allocpages >= VM_LEVEL_0_NPAGES);
	return (m_ret);
}

/*
 * Allocate a physical page from an existing or newly created reservation.
 *
 * The page "mpred" must immediately precede the offset "pindex" within the
 * specified object.
 *
 * The object must be locked.
 *
 * Returns the allocated page, or NULL if the page cannot be provided by a
 * reservation.
*/
vm_page_t
vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain,
    int req, vm_page_t mpred)
{
	struct vm_domain *vmd;
	vm_page_t m, msucc;
	vm_pindex_t first, leftcap, rightcap;
	vm_reserv_t rv;
	int index;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Is a reservation fundamentally impossible?
	 */
	if (pindex < VM_RESERV_INDEX(object, pindex) ||
	    pindex >= object->size)
		return (NULL);

	/*
	 * Look for an existing reservation.
	 */
	rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
	if (rv != NULL) {
		KASSERT(object != kernel_object || rv->domain == domain,
		    ("vm_reserv_alloc_page: domain mismatch"));
		/* From here on, use the reservation's own domain. */
		domain = rv->domain;
		vmd = VM_DOMAIN(domain);
		index = VM_RESERV_INDEX(object, pindex);
		m = &rv->pages[index];
		vm_reserv_lock(rv);
		/* Handle reclaim race. */
		if (rv->object != object ||
		    /* Handle vm_page_rename(m, new_object, ...). */
		    popmap_is_set(rv->popmap, index)) {
			m = NULL;
			goto out;
		}
		if (vm_domain_allocate(vmd, req, 1) == 0)
			m = NULL;
		else
			vm_reserv_populate(rv, index);
out:
		vm_reserv_unlock(rv);
		return (m);
	}

	/*
	 * Could a reservation fit between the first index to the left that
	 * can be used and the first index to the right that cannot be used?
	 *
	 * We must synchronize with the reserv object lock to protect the
	 * pindex/object of the resulting reservations against rename while
	 * we are inspecting.
	 */
	first = pindex - VM_RESERV_INDEX(object, pindex);
	vm_reserv_object_lock(object);
	if (mpred != NULL) {
		if ((rv = vm_reserv_from_page(mpred))->object != object)
			leftcap = mpred->pindex + 1;
		else
			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
		if (leftcap > first) {
			vm_reserv_object_unlock(object);
			return (NULL);
		}
	}
	if (msucc != NULL) {
		if ((rv = vm_reserv_from_page(msucc))->object != object)
			rightcap = msucc->pindex;
		else
			rightcap = rv->pindex;
		if (first + VM_LEVEL_0_NPAGES > rightcap) {
			vm_reserv_object_unlock(object);
			return (NULL);
		}
	}
	vm_reserv_object_unlock(object);

	/*
	 * Would the last new reservation extend past the end of the object?
	 *
	 * If the object is unlikely to grow don't allocate a reservation for
	 * the tail.
	 */
	if ((object->flags & OBJ_ANON) == 0 &&
	    first + VM_LEVEL_0_NPAGES > object->size)
		return (NULL);

	/*
	 * Allocate and populate the new reservation.
	 *
	 * vm_domain_allocate() is called for a single page even though a
	 * whole superpage is obtained from the physical allocator; that one
	 * page is given back through vm_domain_freecnt_inc() if the physical
	 * allocation fails.
	 */
	m = NULL;
	vmd = VM_DOMAIN(domain);
	if (vm_domain_allocate(vmd, req, 1)) {
		vm_domain_free_lock(vmd);
		m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
		    VM_LEVEL_0_ORDER);
		vm_domain_free_unlock(vmd);
		if (m == NULL) {
			vm_domain_freecnt_inc(vmd, 1);
			return (NULL);
		}
	} else
		return (NULL);
	rv = vm_reserv_from_page(m);
	vm_reserv_lock(rv);
	KASSERT(rv->pages == m,
	    ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
	vm_reserv_insert(rv, object, first);
	index = VM_RESERV_INDEX(object, pindex);
	vm_reserv_populate(rv, index);
	vm_reserv_unlock(rv);

	return (&rv->pages[index]);
}

/*
 * Breaks the given reservation.  All free pages in the reservation
 * are returned to the physical memory allocator.  The reservation's
 * population count and map are reset to their initial state.
946 * 947 * The given reservation must not be in the partially populated reservation 948 * queue. 949 */ 950 static void 951 vm_reserv_break(vm_reserv_t rv) 952 { 953 u_long changes; 954 int bitpos, hi, i, lo; 955 956 vm_reserv_assert_locked(rv); 957 CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", 958 __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); 959 vm_reserv_remove(rv); 960 rv->pages->psind = 0; 961 hi = lo = -1; 962 for (i = 0; i <= NPOPMAP; i++) { 963 /* 964 * "changes" is a bitmask that marks where a new sequence of 965 * 0s or 1s begins in popmap[i], with last bit in popmap[i-1] 966 * considered to be 1 if and only if lo == hi. The bits of 967 * popmap[-1] and popmap[NPOPMAP] are considered all 1s. 968 */ 969 if (i == NPOPMAP) 970 changes = lo != hi; 971 else { 972 changes = rv->popmap[i]; 973 changes ^= (changes << 1) | (lo == hi); 974 rv->popmap[i] = 0; 975 } 976 while (changes != 0) { 977 /* 978 * If the next change marked begins a run of 0s, set 979 * lo to mark that position. Otherwise set hi and 980 * free pages from lo up to hi. 981 */ 982 bitpos = ffsl(changes) - 1; 983 changes ^= 1UL << bitpos; 984 if (lo == hi) 985 lo = NBPOPMAP * i + bitpos; 986 else { 987 hi = NBPOPMAP * i + bitpos; 988 vm_domain_free_lock(VM_DOMAIN(rv->domain)); 989 vm_phys_enqueue_contig(&rv->pages[lo], hi - lo); 990 vm_domain_free_unlock(VM_DOMAIN(rv->domain)); 991 lo = hi; 992 } 993 } 994 } 995 rv->popcnt = 0; 996 counter_u64_add(vm_reserv_broken, 1); 997 } 998 999 /* 1000 * Breaks all reservations belonging to the given object. 1001 */ 1002 void 1003 vm_reserv_break_all(vm_object_t object) 1004 { 1005 vm_reserv_t rv; 1006 1007 /* 1008 * This access of object->rvq is unsynchronized so that the 1009 * object rvq lock can nest after the domain_free lock. We 1010 * must check for races in the results. However, the object 1011 * lock prevents new additions, so we are guaranteed that when 1012 * it returns NULL the object is properly empty. 
1013 */ 1014 while ((rv = LIST_FIRST(&object->rvq)) != NULL) { 1015 vm_reserv_lock(rv); 1016 /* Reclaim race. */ 1017 if (rv->object != object) { 1018 vm_reserv_unlock(rv); 1019 continue; 1020 } 1021 vm_reserv_domain_lock(rv->domain); 1022 if (rv->inpartpopq) { 1023 TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq); 1024 rv->inpartpopq = FALSE; 1025 } 1026 vm_reserv_domain_unlock(rv->domain); 1027 vm_reserv_break(rv); 1028 vm_reserv_unlock(rv); 1029 } 1030 } 1031 1032 /* 1033 * Frees the given page if it belongs to a reservation. Returns TRUE if the 1034 * page is freed and FALSE otherwise. 1035 */ 1036 boolean_t 1037 vm_reserv_free_page(vm_page_t m) 1038 { 1039 vm_reserv_t rv; 1040 boolean_t ret; 1041 1042 rv = vm_reserv_from_page(m); 1043 if (rv->object == NULL) 1044 return (FALSE); 1045 vm_reserv_lock(rv); 1046 /* Re-validate after lock. */ 1047 if (rv->object != NULL) { 1048 vm_reserv_depopulate(rv, m - rv->pages); 1049 ret = TRUE; 1050 } else 1051 ret = FALSE; 1052 vm_reserv_unlock(rv); 1053 1054 return (ret); 1055 } 1056 1057 /* 1058 * Initializes the reservation management system. Specifically, initializes 1059 * the reservation array. 1060 * 1061 * Requires that vm_page_array and first_page are initialized! 1062 */ 1063 void 1064 vm_reserv_init(void) 1065 { 1066 vm_paddr_t paddr; 1067 struct vm_phys_seg *seg; 1068 struct vm_reserv *rv; 1069 struct vm_reserv_domain *rvd; 1070 #ifdef VM_PHYSSEG_SPARSE 1071 vm_pindex_t used; 1072 #endif 1073 int i, j, segind; 1074 1075 /* 1076 * Initialize the reservation array. Specifically, initialize the 1077 * "pages" field for every element that has an underlying superpage. 
1078 */ 1079 #ifdef VM_PHYSSEG_SPARSE 1080 used = 0; 1081 #endif 1082 for (segind = 0; segind < vm_phys_nsegs; segind++) { 1083 seg = &vm_phys_segs[segind]; 1084 #ifdef VM_PHYSSEG_SPARSE 1085 seg->first_reserv = &vm_reserv_array[used]; 1086 used += howmany(seg->end, VM_LEVEL_0_SIZE) - 1087 seg->start / VM_LEVEL_0_SIZE; 1088 #else 1089 seg->first_reserv = 1090 &vm_reserv_array[seg->start >> VM_LEVEL_0_SHIFT]; 1091 #endif 1092 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); 1093 rv = seg->first_reserv + (paddr >> VM_LEVEL_0_SHIFT) - 1094 (seg->start >> VM_LEVEL_0_SHIFT); 1095 while (paddr + VM_LEVEL_0_SIZE > paddr && paddr + 1096 VM_LEVEL_0_SIZE <= seg->end) { 1097 rv->pages = PHYS_TO_VM_PAGE(paddr); 1098 rv->domain = seg->domain; 1099 mtx_init(&rv->lock, "vm reserv", NULL, MTX_DEF); 1100 paddr += VM_LEVEL_0_SIZE; 1101 rv++; 1102 } 1103 } 1104 for (i = 0; i < MAXMEMDOM; i++) { 1105 rvd = &vm_rvd[i]; 1106 mtx_init(&rvd->lock, "vm reserv domain", NULL, MTX_DEF); 1107 TAILQ_INIT(&rvd->partpop); 1108 mtx_init(&rvd->marker.lock, "vm reserv marker", NULL, MTX_DEF); 1109 1110 /* 1111 * Fully populated reservations should never be present in the 1112 * partially populated reservation queues. 1113 */ 1114 rvd->marker.popcnt = VM_LEVEL_0_NPAGES; 1115 for (j = 0; j < NBPOPMAP; j++) 1116 popmap_set(rvd->marker.popmap, j); 1117 } 1118 1119 for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++) 1120 mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL, 1121 MTX_DEF); 1122 } 1123 1124 /* 1125 * Returns true if the given page belongs to a reservation and that page is 1126 * free. Otherwise, returns false. 1127 */ 1128 bool 1129 vm_reserv_is_page_free(vm_page_t m) 1130 { 1131 vm_reserv_t rv; 1132 1133 rv = vm_reserv_from_page(m); 1134 if (rv->object == NULL) 1135 return (false); 1136 return (popmap_is_clear(rv->popmap, m - rv->pages)); 1137 } 1138 1139 /* 1140 * If the given page belongs to a reservation, returns the level of that 1141 * reservation. Otherwise, returns -1. 
1142 */ 1143 int 1144 vm_reserv_level(vm_page_t m) 1145 { 1146 vm_reserv_t rv; 1147 1148 rv = vm_reserv_from_page(m); 1149 return (rv->object != NULL ? 0 : -1); 1150 } 1151 1152 /* 1153 * Returns a reservation level if the given page belongs to a fully populated 1154 * reservation and -1 otherwise. 1155 */ 1156 int 1157 vm_reserv_level_iffullpop(vm_page_t m) 1158 { 1159 vm_reserv_t rv; 1160 1161 rv = vm_reserv_from_page(m); 1162 return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); 1163 } 1164 1165 /* 1166 * Remove a partially populated reservation from the queue. 1167 */ 1168 static void 1169 vm_reserv_dequeue(vm_reserv_t rv) 1170 { 1171 1172 vm_reserv_domain_assert_locked(rv->domain); 1173 vm_reserv_assert_locked(rv); 1174 CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", 1175 __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); 1176 KASSERT(rv->inpartpopq, 1177 ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv)); 1178 1179 TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq); 1180 rv->inpartpopq = FALSE; 1181 } 1182 1183 /* 1184 * Breaks the given partially populated reservation, releasing its free pages 1185 * to the physical memory allocator. 1186 */ 1187 static void 1188 vm_reserv_reclaim(vm_reserv_t rv) 1189 { 1190 1191 vm_reserv_assert_locked(rv); 1192 CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", 1193 __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); 1194 if (rv->inpartpopq) { 1195 vm_reserv_domain_lock(rv->domain); 1196 vm_reserv_dequeue(rv); 1197 vm_reserv_domain_unlock(rv->domain); 1198 } 1199 vm_reserv_break(rv); 1200 counter_u64_add(vm_reserv_reclaimed, 1); 1201 } 1202 1203 /* 1204 * Breaks a reservation near the head of the partially populated reservation 1205 * queue, releasing its free pages to the physical memory allocator. Returns 1206 * TRUE if a reservation is broken and FALSE otherwise. 
1207 */ 1208 bool 1209 vm_reserv_reclaim_inactive(int domain) 1210 { 1211 vm_reserv_t rv; 1212 1213 vm_reserv_domain_lock(domain); 1214 TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) { 1215 /* 1216 * A locked reservation is likely being updated or reclaimed, 1217 * so just skip ahead. 1218 */ 1219 if (rv != &vm_rvd[domain].marker && vm_reserv_trylock(rv)) { 1220 vm_reserv_dequeue(rv); 1221 break; 1222 } 1223 } 1224 vm_reserv_domain_unlock(domain); 1225 if (rv != NULL) { 1226 vm_reserv_reclaim(rv); 1227 vm_reserv_unlock(rv); 1228 return (true); 1229 } 1230 return (false); 1231 } 1232 1233 /* 1234 * Determine whether this reservation has free pages that satisfy the given 1235 * request for contiguous physical memory. Start searching from the lower 1236 * bound, defined by low_index. 1237 */ 1238 static bool 1239 vm_reserv_test_contig(vm_reserv_t rv, u_long npages, vm_paddr_t low, 1240 vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1241 { 1242 vm_paddr_t pa, size; 1243 u_long changes; 1244 int bitpos, bits_left, i, hi, lo, n; 1245 1246 vm_reserv_assert_locked(rv); 1247 size = npages << PAGE_SHIFT; 1248 pa = VM_PAGE_TO_PHYS(&rv->pages[0]); 1249 lo = (pa < low) ? 1250 ((low + PAGE_MASK - pa) >> PAGE_SHIFT) : 0; 1251 i = lo / NBPOPMAP; 1252 changes = rv->popmap[i] | ((1UL << (lo % NBPOPMAP)) - 1); 1253 hi = (pa + VM_LEVEL_0_SIZE > high) ? 1254 ((high + PAGE_MASK - pa) >> PAGE_SHIFT) : VM_LEVEL_0_NPAGES; 1255 n = hi / NBPOPMAP; 1256 bits_left = hi % NBPOPMAP; 1257 hi = lo = -1; 1258 for (;;) { 1259 /* 1260 * "changes" is a bitmask that marks where a new sequence of 1261 * 0s or 1s begins in popmap[i], with last bit in popmap[i-1] 1262 * considered to be 1 if and only if lo == hi. The bits of 1263 * popmap[-1] and popmap[NPOPMAP] are considered all 1s. 1264 */ 1265 changes ^= (changes << 1) | (lo == hi); 1266 while (changes != 0) { 1267 /* 1268 * If the next change marked begins a run of 0s, set 1269 * lo to mark that position. 
Otherwise set hi and 1270 * look for a satisfactory first page from lo up to hi. 1271 */ 1272 bitpos = ffsl(changes) - 1; 1273 changes ^= 1UL << bitpos; 1274 if (lo == hi) { 1275 lo = NBPOPMAP * i + bitpos; 1276 continue; 1277 } 1278 hi = NBPOPMAP * i + bitpos; 1279 pa = VM_PAGE_TO_PHYS(&rv->pages[lo]); 1280 if ((pa & (alignment - 1)) != 0) { 1281 /* Skip to next aligned page. */ 1282 lo += (((pa - 1) | (alignment - 1)) + 1) >> 1283 PAGE_SHIFT; 1284 if (lo >= VM_LEVEL_0_NPAGES) 1285 return (false); 1286 pa = VM_PAGE_TO_PHYS(&rv->pages[lo]); 1287 } 1288 if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { 1289 /* Skip to next boundary-matching page. */ 1290 lo += (((pa - 1) | (boundary - 1)) + 1) >> 1291 PAGE_SHIFT; 1292 if (lo >= VM_LEVEL_0_NPAGES) 1293 return (false); 1294 pa = VM_PAGE_TO_PHYS(&rv->pages[lo]); 1295 } 1296 if (lo * PAGE_SIZE + size <= hi * PAGE_SIZE) 1297 return (true); 1298 lo = hi; 1299 } 1300 if (++i < n) 1301 changes = rv->popmap[i]; 1302 else if (i == n) 1303 changes = bits_left == 0 ? -1UL : 1304 (rv->popmap[n] | (-1UL << bits_left)); 1305 else 1306 return (false); 1307 } 1308 } 1309 1310 /* 1311 * Searches the partially populated reservation queue for the least recently 1312 * changed reservation with free pages that satisfy the given request for 1313 * contiguous physical memory. If a satisfactory reservation is found, it is 1314 * broken. Returns true if a reservation is broken and false otherwise. 
1315 */ 1316 bool 1317 vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low, 1318 vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 1319 { 1320 struct vm_reserv_queue *queue; 1321 vm_paddr_t pa, size; 1322 vm_reserv_t marker, rv, rvn; 1323 1324 if (npages > VM_LEVEL_0_NPAGES - 1) 1325 return (false); 1326 marker = &vm_rvd[domain].marker; 1327 queue = &vm_rvd[domain].partpop; 1328 size = npages << PAGE_SHIFT; 1329 1330 vm_reserv_domain_scan_lock(domain); 1331 vm_reserv_domain_lock(domain); 1332 TAILQ_FOREACH_SAFE(rv, queue, partpopq, rvn) { 1333 pa = VM_PAGE_TO_PHYS(&rv->pages[0]); 1334 if (pa + VM_LEVEL_0_SIZE - size < low) { 1335 /* This entire reservation is too low; go to next. */ 1336 continue; 1337 } 1338 if (pa + size > high) { 1339 /* This entire reservation is too high; go to next. */ 1340 continue; 1341 } 1342 1343 if (vm_reserv_trylock(rv) == 0) { 1344 TAILQ_INSERT_AFTER(queue, rv, marker, partpopq); 1345 vm_reserv_domain_unlock(domain); 1346 vm_reserv_lock(rv); 1347 if (TAILQ_PREV(marker, vm_reserv_queue, partpopq) != 1348 rv) { 1349 vm_reserv_unlock(rv); 1350 vm_reserv_domain_lock(domain); 1351 rvn = TAILQ_NEXT(marker, partpopq); 1352 TAILQ_REMOVE(queue, marker, partpopq); 1353 continue; 1354 } 1355 vm_reserv_domain_lock(domain); 1356 TAILQ_REMOVE(queue, marker, partpopq); 1357 } 1358 vm_reserv_domain_unlock(domain); 1359 if (vm_reserv_test_contig(rv, npages, low, high, 1360 alignment, boundary)) { 1361 vm_reserv_domain_scan_unlock(domain); 1362 vm_reserv_reclaim(rv); 1363 vm_reserv_unlock(rv); 1364 return (true); 1365 } 1366 vm_reserv_domain_lock(domain); 1367 rvn = TAILQ_NEXT(rv, partpopq); 1368 vm_reserv_unlock(rv); 1369 } 1370 vm_reserv_domain_unlock(domain); 1371 vm_reserv_domain_scan_unlock(domain); 1372 return (false); 1373 } 1374 1375 /* 1376 * Transfers the reservation underlying the given page to a new object. 1377 * 1378 * The object must be locked. 
1379 */ 1380 void 1381 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, 1382 vm_pindex_t old_object_offset) 1383 { 1384 vm_reserv_t rv; 1385 1386 VM_OBJECT_ASSERT_WLOCKED(new_object); 1387 rv = vm_reserv_from_page(m); 1388 if (rv->object == old_object) { 1389 vm_reserv_lock(rv); 1390 CTR6(KTR_VM, 1391 "%s: rv %p object %p new %p popcnt %d inpartpop %d", 1392 __FUNCTION__, rv, rv->object, new_object, rv->popcnt, 1393 rv->inpartpopq); 1394 if (rv->object == old_object) { 1395 vm_reserv_object_lock(old_object); 1396 rv->object = NULL; 1397 LIST_REMOVE(rv, objq); 1398 vm_reserv_object_unlock(old_object); 1399 vm_reserv_object_lock(new_object); 1400 rv->object = new_object; 1401 rv->pindex -= old_object_offset; 1402 LIST_INSERT_HEAD(&new_object->rvq, rv, objq); 1403 vm_reserv_object_unlock(new_object); 1404 } 1405 vm_reserv_unlock(rv); 1406 } 1407 } 1408 1409 /* 1410 * Returns the size (in bytes) of a reservation of the specified level. 1411 */ 1412 int 1413 vm_reserv_size(int level) 1414 { 1415 1416 switch (level) { 1417 case 0: 1418 return (VM_LEVEL_0_SIZE); 1419 case -1: 1420 return (PAGE_SIZE); 1421 default: 1422 return (0); 1423 } 1424 } 1425 1426 /* 1427 * Allocates the virtual and physical memory required by the reservation 1428 * management system's data structures, in particular, the reservation array. 
1429 */ 1430 vm_paddr_t 1431 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end) 1432 { 1433 vm_paddr_t new_end; 1434 vm_pindex_t count; 1435 size_t size; 1436 int i; 1437 1438 count = 0; 1439 for (i = 0; i < vm_phys_nsegs; i++) { 1440 #ifdef VM_PHYSSEG_SPARSE 1441 count += howmany(vm_phys_segs[i].end, VM_LEVEL_0_SIZE) - 1442 vm_phys_segs[i].start / VM_LEVEL_0_SIZE; 1443 #else 1444 count = MAX(count, 1445 howmany(vm_phys_segs[i].end, VM_LEVEL_0_SIZE)); 1446 #endif 1447 } 1448 1449 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 1450 #ifdef VM_PHYSSEG_SPARSE 1451 count += howmany(phys_avail[i + 1], VM_LEVEL_0_SIZE) - 1452 phys_avail[i] / VM_LEVEL_0_SIZE; 1453 #else 1454 count = MAX(count, 1455 howmany(phys_avail[i + 1], VM_LEVEL_0_SIZE)); 1456 #endif 1457 } 1458 1459 /* 1460 * Calculate the size (in bytes) of the reservation array. Rounding up 1461 * for partial superpages at boundaries, as every small page is mapped 1462 * to an element in the reservation array based on its physical address. 1463 * Thus, the number of elements in the reservation array can be greater 1464 * than the number of superpages. 1465 */ 1466 size = count * sizeof(struct vm_reserv); 1467 1468 /* 1469 * Allocate and map the physical memory for the reservation array. The 1470 * next available virtual address is returned by reference. 1471 */ 1472 new_end = end - round_page(size); 1473 vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end, 1474 VM_PROT_READ | VM_PROT_WRITE); 1475 bzero(vm_reserv_array, size); 1476 1477 /* 1478 * Return the next available physical address. 1479 */ 1480 return (new_end); 1481 } 1482 1483 /* 1484 * Returns the superpage containing the given page. 
1485 */ 1486 vm_page_t 1487 vm_reserv_to_superpage(vm_page_t m) 1488 { 1489 vm_reserv_t rv; 1490 1491 VM_OBJECT_ASSERT_LOCKED(m->object); 1492 rv = vm_reserv_from_page(m); 1493 if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES) 1494 m = rv->pages; 1495 else 1496 m = NULL; 1497 1498 return (m); 1499 } 1500 1501 #endif /* VM_NRESERVLEVEL > 0 */ 1502