xref: /freebsd/sys/vm/vm_reserv.c (revision f0cfa1b168014f56c02b83e5f28412cc5f78d117)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2002-2006 Rice University
5  * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu>
6  * All rights reserved.
7  *
8  * This software was developed for the FreeBSD Project by Alan L. Cox,
9  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
27  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
30  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  *	Superpage reservation management module
36  *
37  * Any external functions defined by this module are only to be used by the
38  * virtual memory system.
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_vm.h"
45 
46 #include <sys/param.h>
47 #include <sys/kernel.h>
48 #include <sys/lock.h>
49 #include <sys/malloc.h>
50 #include <sys/mutex.h>
51 #include <sys/queue.h>
52 #include <sys/rwlock.h>
53 #include <sys/sbuf.h>
54 #include <sys/sysctl.h>
55 #include <sys/systm.h>
56 #include <sys/vmmeter.h>
57 
58 #include <vm/vm.h>
59 #include <vm/vm_param.h>
60 #include <vm/vm_object.h>
61 #include <vm/vm_page.h>
62 #include <vm/vm_phys.h>
63 #include <vm/vm_radix.h>
64 #include <vm/vm_reserv.h>
65 
66 /*
67  * The reservation system supports the speculative allocation of large physical
68  * pages ("superpages").  Speculative allocation enables the fully automatic
69  * utilization of superpages by the virtual memory system.  In other words, no
70  * programmatic directives are required to use superpages.
71  */
72 
73 #if VM_NRESERVLEVEL > 0
74 
75 /*
76  * The number of small pages that are contained in a level 0 reservation
77  */
78 #define	VM_LEVEL_0_NPAGES	(1 << VM_LEVEL_0_ORDER)
79 
80 /*
81  * The number of bits by which a physical address is shifted to obtain the
82  * reservation number
83  */
84 #define	VM_LEVEL_0_SHIFT	(VM_LEVEL_0_ORDER + PAGE_SHIFT)
85 
86 /*
87  * The size of a level 0 reservation in bytes
88  */
89 #define	VM_LEVEL_0_SIZE		(1 << VM_LEVEL_0_SHIFT)
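/*
 * For example, with 4KB pages (PAGE_SHIFT == 12) and a VM_LEVEL_0_ORDER of
 * 9, as on amd64, VM_LEVEL_0_NPAGES is 512, VM_LEVEL_0_SHIFT is 21, and
 * VM_LEVEL_0_SIZE is 2MB, the size of a superpage mapping on that platform.
 */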
90 
91 /*
92  * Computes the index of the small page underlying the given (object, pindex)
93  * within the reservation's array of small pages.
94  */
95 #define	VM_RESERV_INDEX(object, pindex)	\
96     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
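/*
 * For example, assuming VM_LEVEL_0_ORDER is 9 (512-page reservations), an
 * object with a "pg_color" of 5 maps pindex 700 to index (5 + 700) & 511,
 * i.e., 193, within the reservation backing that range of the object.
 */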
97 
98 /*
99  * The size of a population map entry
100  */
101 typedef	u_long		popmap_t;
102 
103 /*
104  * The number of bits in a population map entry
105  */
106 #define	NBPOPMAP	(NBBY * sizeof(popmap_t))
107 
108 /*
109  * The number of population map entries in a reservation
110  */
111 #define	NPOPMAP		howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
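/*
 * For example, on an LP64 platform with 512-page reservations, NBPOPMAP is
 * 64 and NPOPMAP is howmany(512, 64), i.e., 8, so each reservation tracks
 * its pages with eight 64-bit population map words.
 */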
112 
113 /*
114  * Clear a bit in the population map.
115  */
116 static __inline void
117 popmap_clear(popmap_t popmap[], int i)
118 {
119 
120 	popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
121 }
122 
123 /*
124  * Set a bit in the population map.
125  */
126 static __inline void
127 popmap_set(popmap_t popmap[], int i)
128 {
129 
130 	popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
131 }
132 
133 /*
134  * Is a bit in the population map clear?
135  */
136 static __inline boolean_t
137 popmap_is_clear(popmap_t popmap[], int i)
138 {
139 
140 	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
141 }
142 
143 /*
144  * Is a bit in the population map set?
145  */
146 static __inline boolean_t
147 popmap_is_set(popmap_t popmap[], int i)
148 {
149 
150 	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
151 }
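/*
 * Each helper above locates bit "i" by splitting it into a word index
 * (i / NBPOPMAP) and a bit offset (i % NBPOPMAP).  For example, with an
 * NBPOPMAP of 64, the page at index 130 of a reservation is tracked by bit
 * 2 of popmap[2].
 */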
152 
153 /*
154  * The reservation structure
155  *
156  * A reservation structure is constructed whenever a large physical page is
157  * speculatively allocated to an object.  The reservation provides the small
158  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
159  * within that object.  The reservation's "popcnt" tracks the number of these
160  * small physical pages that are in use at any given time.  When and if the
161  * reservation is not fully utilized, it appears in the queue of partially
162  * populated reservations.  The reservation always appears on the containing
163  * object's list of reservations.
164  *
165  * A partially populated reservation can be broken and reclaimed at any time.
166  */
167 struct vm_reserv {
168 	TAILQ_ENTRY(vm_reserv) partpopq;
169 	LIST_ENTRY(vm_reserv) objq;
170 	vm_object_t	object;			/* containing object */
171 	vm_pindex_t	pindex;			/* offset within object */
172 	vm_page_t	pages;			/* first page of a superpage */
173 	int		popcnt;			/* # of pages in use */
174 	char		inpartpopq;
175 	popmap_t	popmap[NPOPMAP];	/* bit vector of used pages */
176 };
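/*
 * For example, immediately after vm_reserv_alloc_page() creates a new
 * reservation, "popcnt" is 1, exactly one bit is set in "popmap", the
 * reservation is linked onto the object's "rvq" list, and "inpartpopq" is
 * TRUE because the reservation sits at the tail of the partially populated
 * queue.
 */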
177 
178 /*
179  * The reservation array
180  *
181  * This array is analogous in function to vm_page_array.  It differs in the
182  * respect that it may contain a greater number of reservation structures
183  * than there are (physical) superpages.  These "invalid" reservation
184  * structures exist to trade off space for time in the implementation of
185  * vm_reserv_from_page().  Invalid reservation structures are
186  * distinguishable from "valid" reservation structures by inspecting the
187  * reservation's "pages" field.  Invalid reservation structures have a NULL
188  * "pages" field.
189  *
190  * vm_reserv_from_page() maps a small (physical) page to an element of this
191  * array by computing a physical reservation number from the page's physical
192  * address.  The physical reservation number is used as the array index.
193  *
194  * An "active" reservation is a valid reservation structure that has a non-NULL
195  * "object" field and a non-zero "popcnt" field.  In other words, every active
196  * reservation belongs to a particular object.  Moreover, every active
197  * reservation has an entry in the containing object's list of reservations.
198  */
199 static vm_reserv_t vm_reserv_array;
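/*
 * For example, assuming 2MB reservations (VM_LEVEL_0_SHIFT == 21), the
 * small page at physical address 0x40301000 maps to vm_reserv_array[0x201],
 * as does every other small page in the range [0x40200000, 0x40400000).
 * If that 2MB frame does not lie entirely within one physical segment,
 * vm_reserv_init() leaves the element's "pages" field NULL and the element
 * remains an "invalid" reservation structure.
 */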
200 
201 /*
202  * The partially populated reservation queue
203  *
204  * This queue enables the fast recovery of an unused free small page from a
205  * partially populated reservation.  The reservation at the head of this queue
206  * is the least recently changed, partially populated reservation.
207  *
208  * Access to this queue is synchronized by the free page queue lock.
209  */
210 static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop =
211 			    TAILQ_HEAD_INITIALIZER(vm_rvq_partpop);
212 
213 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
214 
215 static long vm_reserv_broken;
216 SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
217     &vm_reserv_broken, 0, "Cumulative number of broken reservations");
218 
219 static long vm_reserv_freed;
220 SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
221     &vm_reserv_freed, 0, "Cumulative number of freed reservations");
222 
223 static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
224 
225 SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
226     sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
227 
228 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
229 
230 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
231     sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues");
232 
233 static long vm_reserv_reclaimed;
234 SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
235     &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
236 
237 static void		vm_reserv_break(vm_reserv_t rv, vm_page_t m);
238 static void		vm_reserv_depopulate(vm_reserv_t rv, int index);
239 static vm_reserv_t	vm_reserv_from_page(vm_page_t m);
240 static boolean_t	vm_reserv_has_pindex(vm_reserv_t rv,
241 			    vm_pindex_t pindex);
242 static void		vm_reserv_populate(vm_reserv_t rv, int index);
243 static void		vm_reserv_reclaim(vm_reserv_t rv);
244 
245 /*
246  * Returns the current number of full reservations.
247  *
248  * Since the number of full reservations is computed without acquiring the
249  * free page queue lock, the returned value may be inexact.
250  */
251 static int
252 sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
253 {
254 	vm_paddr_t paddr;
255 	struct vm_phys_seg *seg;
256 	vm_reserv_t rv;
257 	int fullpop, segind;
258 
259 	fullpop = 0;
260 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
261 		seg = &vm_phys_segs[segind];
262 		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
263 		while (paddr + VM_LEVEL_0_SIZE <= seg->end) {
264 			rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
265 			fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
266 			paddr += VM_LEVEL_0_SIZE;
267 		}
268 	}
269 	return (sysctl_handle_int(oidp, &fullpop, 0, req));
270 }
271 
272 /*
273  * Describes the current state of the partially populated reservation queue.
274  */
275 static int
276 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
277 {
278 	struct sbuf sbuf;
279 	vm_reserv_t rv;
280 	int counter, error, level, unused_pages;
281 
282 	error = sysctl_wire_old_buffer(req, 0);
283 	if (error != 0)
284 		return (error);
285 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
286 	sbuf_printf(&sbuf, "\nLEVEL     SIZE  NUMBER\n\n");
287 	for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
288 		counter = 0;
289 		unused_pages = 0;
290 		mtx_lock(&vm_page_queue_free_mtx);
291 		TAILQ_FOREACH(rv, &vm_rvq_partpop/*[level]*/, partpopq) {
292 			counter++;
293 			unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
294 		}
295 		mtx_unlock(&vm_page_queue_free_mtx);
296 		sbuf_printf(&sbuf, "%5d: %6dK, %6d\n", level,
297 		    unused_pages * ((int)PAGE_SIZE / 1024), counter);
298 	}
299 	error = sbuf_finish(&sbuf);
300 	sbuf_delete(&sbuf);
301 	return (error);
302 }
303 
304 /*
305  * Reduces the given reservation's population count.  If the population count
306  * becomes zero, the reservation is destroyed.  Additionally, moves the
307  * reservation to the tail of the partially populated reservation queue if the
308  * population count is non-zero.
309  *
310  * The free page queue lock must be held.
311  */
312 static void
313 vm_reserv_depopulate(vm_reserv_t rv, int index)
314 {
315 
316 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
317 	KASSERT(rv->object != NULL,
318 	    ("vm_reserv_depopulate: reserv %p is free", rv));
319 	KASSERT(popmap_is_set(rv->popmap, index),
320 	    ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
321 	    index));
322 	KASSERT(rv->popcnt > 0,
323 	    ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
324 	if (rv->inpartpopq) {
325 		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
326 		rv->inpartpopq = FALSE;
327 	} else {
328 		KASSERT(rv->pages->psind == 1,
329 		    ("vm_reserv_depopulate: reserv %p is already demoted",
330 		    rv));
331 		rv->pages->psind = 0;
332 	}
333 	popmap_clear(rv->popmap, index);
334 	rv->popcnt--;
335 	if (rv->popcnt == 0) {
336 		LIST_REMOVE(rv, objq);
337 		rv->object = NULL;
338 		vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
339 		vm_reserv_freed++;
340 	} else {
341 		rv->inpartpopq = TRUE;
342 		TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
343 	}
344 }
345 
346 /*
347  * Returns the reservation to which the given page might belong.
348  */
349 static __inline vm_reserv_t
350 vm_reserv_from_page(vm_page_t m)
351 {
352 
353 	return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
354 }
355 
356 /*
357  * Returns TRUE if the given reservation contains the given page index and
358  * FALSE otherwise.
359  */
360 static __inline boolean_t
361 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
362 {
363 
364 	return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
365 }
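/*
 * Because vm_pindex_t is unsigned, the difference computed above has bits
 * set outside the low VM_LEVEL_0_ORDER bits unless "pindex" lies within
 * [rv->pindex, rv->pindex + VM_LEVEL_0_NPAGES), so the single masked
 * comparison is equivalent to the pair of range checks.
 */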
366 
367 /*
368  * Increases the given reservation's population count.  Moves the reservation
369  * to the tail of the partially populated reservation queue if it is not full.
370  *
371  * The free page queue must be locked.
372  */
373 static void
374 vm_reserv_populate(vm_reserv_t rv, int index)
375 {
376 
377 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
378 	KASSERT(rv->object != NULL,
379 	    ("vm_reserv_populate: reserv %p is free", rv));
380 	KASSERT(popmap_is_clear(rv->popmap, index),
381 	    ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
382 	    index));
383 	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
384 	    ("vm_reserv_populate: reserv %p is already full", rv));
385 	KASSERT(rv->pages->psind == 0,
386 	    ("vm_reserv_populate: reserv %p is already promoted", rv));
387 	if (rv->inpartpopq) {
388 		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
389 		rv->inpartpopq = FALSE;
390 	}
391 	popmap_set(rv->popmap, index);
392 	rv->popcnt++;
393 	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
394 		rv->inpartpopq = TRUE;
395 		TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
396 	} else
397 		rv->pages->psind = 1;
398 }
399 
400 /*
401  * Allocates a contiguous set of physical pages of the given size "npages"
402  * from existing or newly created reservations.  All of the physical pages
403  * must be at or above the given physical address "low" and below the given
404  * physical address "high".  The given value "alignment" determines the
405  * alignment of the first physical page in the set.  If the given value
406  * "boundary" is non-zero, then the set of physical pages cannot cross any
407  * physical address boundary that is a multiple of that value.  Both
408  * "alignment" and "boundary" must be a power of two.
409  *
410  * The page "mpred" must immediately precede the offset "pindex" within the
411  * specified object.
412  *
413  * The object and free page queue must be locked.
414  */
415 vm_page_t
416 vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
417     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
418     vm_page_t mpred)
419 {
420 	vm_paddr_t pa, size;
421 	vm_page_t m, m_ret, msucc;
422 	vm_pindex_t first, leftcap, rightcap;
423 	vm_reserv_t rv;
424 	u_long allocpages, maxpages, minpages;
425 	int i, index, n;
426 
427 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
428 	VM_OBJECT_ASSERT_WLOCKED(object);
429 	KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
430 
431 	/*
432 	 * Is a reservation fundamentally impossible?
433 	 */
434 	if (pindex < VM_RESERV_INDEX(object, pindex) ||
435 	    pindex + npages > object->size)
436 		return (NULL);
437 
438 	/*
439 	 * All reservations of a particular size have the same alignment.
440 	 * Assuming that the first page is allocated from a reservation, the
441 	 * least significant bits of its physical address can be determined
442 	 * from its offset from the beginning of the reservation and the size
443 	 * of the reservation.
444 	 *
445 	 * Could the specified index within a reservation of the smallest
446 	 * possible size satisfy the alignment and boundary requirements?
447 	 */
448 	pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
449 	if ((pa & (alignment - 1)) != 0)
450 		return (NULL);
451 	size = npages << PAGE_SHIFT;
452 	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
453 		return (NULL);
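	/*
	 * For example, with 4KB pages, a request for 8 pages with ordinary
	 * page alignment and a 64KB "boundary" fails the test above whenever
	 * VM_RESERV_INDEX() lands in the last 7 pages of a 64KB-aligned
	 * window, because the run would then cross a 64KB physical boundary
	 * regardless of which reservation backs it.
	 */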
454 
455 	/*
456 	 * Look for an existing reservation.
457 	 */
458 	if (mpred != NULL) {
459 		KASSERT(mpred->object == object,
460 		    ("vm_reserv_alloc_contig: object doesn't contain mpred"));
461 		KASSERT(mpred->pindex < pindex,
462 		    ("vm_reserv_alloc_contig: mpred doesn't precede pindex"));
463 		rv = vm_reserv_from_page(mpred);
464 		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
465 			goto found;
466 		msucc = TAILQ_NEXT(mpred, listq);
467 	} else
468 		msucc = TAILQ_FIRST(&object->memq);
469 	if (msucc != NULL) {
470 		KASSERT(msucc->pindex > pindex,
471 		    ("vm_reserv_alloc_contig: msucc doesn't succeed pindex"));
472 		rv = vm_reserv_from_page(msucc);
473 		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
474 			goto found;
475 	}
476 
477 	/*
478 	 * Could at least one reservation fit between the first index to the
479 	 * left that can be used ("leftcap") and the first index to the right
480 	 * that cannot be used ("rightcap")?
481 	 */
482 	first = pindex - VM_RESERV_INDEX(object, pindex);
483 	if (mpred != NULL) {
484 		if ((rv = vm_reserv_from_page(mpred))->object != object)
485 			leftcap = mpred->pindex + 1;
486 		else
487 			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
488 		if (leftcap > first)
489 			return (NULL);
490 	}
491 	minpages = VM_RESERV_INDEX(object, pindex) + npages;
492 	maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
493 	allocpages = maxpages;
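	/*
	 * For example, with 512-page reservations, a request for 600 pages
	 * whose VM_RESERV_INDEX() is 100 yields a "minpages" of 700 and a
	 * "maxpages" of roundup2(700, 512), i.e., 1024, so up to two full
	 * reservations may be allocated below.
	 */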
494 	if (msucc != NULL) {
495 		if ((rv = vm_reserv_from_page(msucc))->object != object)
496 			rightcap = msucc->pindex;
497 		else
498 			rightcap = rv->pindex;
499 		if (first + maxpages > rightcap) {
500 			if (maxpages == VM_LEVEL_0_NPAGES)
501 				return (NULL);
502 
503 			/*
504 			 * At least one reservation will fit between "leftcap"
505 			 * and "rightcap".  However, a reservation for the
506 			 * last of the requested pages will not fit.  Reduce
507 			 * the size of the upcoming allocation accordingly.
508 			 */
509 			allocpages = minpages;
510 		}
511 	}
512 
513 	/*
514 	 * Would the last new reservation extend past the end of the object?
515 	 */
516 	if (first + maxpages > object->size) {
517 		/*
518 		 * Don't allocate the last new reservation if the object is a
519 		 * vnode or backed by another object that is a vnode.
520 		 */
521 		if (object->type == OBJT_VNODE ||
522 		    (object->backing_object != NULL &&
523 		    object->backing_object->type == OBJT_VNODE)) {
524 			if (maxpages == VM_LEVEL_0_NPAGES)
525 				return (NULL);
526 			allocpages = minpages;
527 		}
528 		/* Speculate that the object may grow. */
529 	}
530 
531 	/*
532 	 * Allocate the physical pages.  The alignment and boundary specified
533 	 * for this allocation may be different from the alignment and
534 	 * boundary specified for the requested pages.  For instance, the
535 	 * specified index may not be the first page within the first new
536 	 * reservation.
537 	 */
538 	m = vm_phys_alloc_contig(allocpages, low, high, ulmax(alignment,
539 	    VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0);
540 	if (m == NULL)
541 		return (NULL);
542 
543 	/*
544 	 * The allocated physical pages always begin at a reservation
545 	 * boundary, but they do not always end at a reservation boundary.
546 	 * Initialize every reservation that is completely covered by the
547 	 * allocated physical pages.
548 	 */
549 	m_ret = NULL;
550 	index = VM_RESERV_INDEX(object, pindex);
551 	do {
552 		rv = vm_reserv_from_page(m);
553 		KASSERT(rv->pages == m,
554 		    ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
555 		    rv));
556 		KASSERT(rv->object == NULL,
557 		    ("vm_reserv_alloc_contig: reserv %p isn't free", rv));
558 		LIST_INSERT_HEAD(&object->rvq, rv, objq);
559 		rv->object = object;
560 		rv->pindex = first;
561 		KASSERT(rv->popcnt == 0,
562 		    ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted",
563 		    rv));
564 		KASSERT(!rv->inpartpopq,
565 		    ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE",
566 		    rv));
567 		for (i = 0; i < NPOPMAP; i++)
568 			KASSERT(rv->popmap[i] == 0,
569 		    ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted",
570 			    rv));
571 		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
572 		for (i = 0; i < n; i++)
573 			vm_reserv_populate(rv, index + i);
574 		npages -= n;
575 		if (m_ret == NULL) {
576 			m_ret = &rv->pages[index];
577 			index = 0;
578 		}
579 		m += VM_LEVEL_0_NPAGES;
580 		first += VM_LEVEL_0_NPAGES;
581 		allocpages -= VM_LEVEL_0_NPAGES;
582 	} while (allocpages >= VM_LEVEL_0_NPAGES);
583 	return (m_ret);
584 
585 	/*
586 	 * Found a matching reservation.
587 	 */
588 found:
589 	index = VM_RESERV_INDEX(object, pindex);
590 	/* Does the allocation fit within the reservation? */
591 	if (index + npages > VM_LEVEL_0_NPAGES)
592 		return (NULL);
593 	m = &rv->pages[index];
594 	pa = VM_PAGE_TO_PHYS(m);
595 	if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
596 	    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
597 		return (NULL);
598 	/* Handle vm_page_rename(m, new_object, ...). */
599 	for (i = 0; i < npages; i++)
600 		if (popmap_is_set(rv->popmap, index + i))
601 			return (NULL);
602 	for (i = 0; i < npages; i++)
603 		vm_reserv_populate(rv, index + i);
604 	return (m);
605 }
606 
607 /*
608  * Allocates a page from an existing or newly created reservation.
609  *
610  * The page "mpred" must immediately precede the offset "pindex" within the
611  * specified object.
612  *
613  * The object and free page queue must be locked.
614  */
615 vm_page_t
616 vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred)
617 {
618 	vm_page_t m, msucc;
619 	vm_pindex_t first, leftcap, rightcap;
620 	vm_reserv_t rv;
621 	int i, index;
622 
623 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
624 	VM_OBJECT_ASSERT_WLOCKED(object);
625 
626 	/*
627 	 * Is a reservation fundamentally impossible?
628 	 */
629 	if (pindex < VM_RESERV_INDEX(object, pindex) ||
630 	    pindex >= object->size)
631 		return (NULL);
632 
633 	/*
634 	 * Look for an existing reservation.
635 	 */
636 	if (mpred != NULL) {
637 		KASSERT(mpred->object == object,
638 		    ("vm_reserv_alloc_page: object doesn't contain mpred"));
639 		KASSERT(mpred->pindex < pindex,
640 		    ("vm_reserv_alloc_page: mpred doesn't precede pindex"));
641 		rv = vm_reserv_from_page(mpred);
642 		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
643 			goto found;
644 		msucc = TAILQ_NEXT(mpred, listq);
645 	} else
646 		msucc = TAILQ_FIRST(&object->memq);
647 	if (msucc != NULL) {
648 		KASSERT(msucc->pindex > pindex,
649 		    ("vm_reserv_alloc_page: msucc doesn't succeed pindex"));
650 		rv = vm_reserv_from_page(msucc);
651 		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
652 			goto found;
653 	}
654 
655 	/*
656 	 * Could a reservation fit between the first index to the left that
657 	 * can be used and the first index to the right that cannot be used?
658 	 */
659 	first = pindex - VM_RESERV_INDEX(object, pindex);
660 	if (mpred != NULL) {
661 		if ((rv = vm_reserv_from_page(mpred))->object != object)
662 			leftcap = mpred->pindex + 1;
663 		else
664 			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
665 		if (leftcap > first)
666 			return (NULL);
667 	}
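	/*
	 * For example, with 512-page reservations, if "first" is 1024 but
	 * "mpred" is an unreserved page at pindex 1040, then "leftcap" is
	 * 1041, which exceeds "first", so a new reservation covering
	 * [1024, 1536) would collide with that page and NULL is returned.
	 */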
668 	if (msucc != NULL) {
669 		if ((rv = vm_reserv_from_page(msucc))->object != object)
670 			rightcap = msucc->pindex;
671 		else
672 			rightcap = rv->pindex;
673 		if (first + VM_LEVEL_0_NPAGES > rightcap)
674 			return (NULL);
675 	}
676 
677 	/*
678 	 * Would a new reservation extend past the end of the object?
679 	 */
680 	if (first + VM_LEVEL_0_NPAGES > object->size) {
681 		/*
682 		 * Don't allocate a new reservation if the object is a vnode or
683 		 * backed by another object that is a vnode.
684 		 */
685 		if (object->type == OBJT_VNODE ||
686 		    (object->backing_object != NULL &&
687 		    object->backing_object->type == OBJT_VNODE))
688 			return (NULL);
689 		/* Speculate that the object may grow. */
690 	}
691 
692 	/*
693 	 * Allocate and populate the new reservation.
694 	 */
695 	m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER);
696 	if (m == NULL)
697 		return (NULL);
698 	rv = vm_reserv_from_page(m);
699 	KASSERT(rv->pages == m,
700 	    ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
701 	KASSERT(rv->object == NULL,
702 	    ("vm_reserv_alloc_page: reserv %p isn't free", rv));
703 	LIST_INSERT_HEAD(&object->rvq, rv, objq);
704 	rv->object = object;
705 	rv->pindex = first;
706 	KASSERT(rv->popcnt == 0,
707 	    ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
708 	KASSERT(!rv->inpartpopq,
709 	    ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
710 	for (i = 0; i < NPOPMAP; i++)
711 		KASSERT(rv->popmap[i] == 0,
712 		    ("vm_reserv_alloc_page: reserv %p's popmap is corrupted",
713 		    rv));
714 	index = VM_RESERV_INDEX(object, pindex);
715 	vm_reserv_populate(rv, index);
716 	return (&rv->pages[index]);
717 
718 	/*
719 	 * Found a matching reservation.
720 	 */
721 found:
722 	index = VM_RESERV_INDEX(object, pindex);
723 	m = &rv->pages[index];
724 	/* Handle vm_page_rename(m, new_object, ...). */
725 	if (popmap_is_set(rv->popmap, index))
726 		return (NULL);
727 	vm_reserv_populate(rv, index);
728 	return (m);
729 }
730 
731 /*
732  * Breaks the given reservation.  Except for the specified free page, all free
733  * pages in the reservation are returned to the physical memory allocator.
734  * The reservation's population count and map are reset to their initial
735  * state.
736  *
737  * The given reservation must not be in the partially populated reservation
738  * queue.  The free page queue lock must be held.
739  */
740 static void
741 vm_reserv_break(vm_reserv_t rv, vm_page_t m)
742 {
743 	int begin_zeroes, hi, i, lo;
744 
745 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
746 	KASSERT(rv->object != NULL,
747 	    ("vm_reserv_break: reserv %p is free", rv));
748 	KASSERT(!rv->inpartpopq,
749 	    ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv));
750 	LIST_REMOVE(rv, objq);
751 	rv->object = NULL;
752 	if (m != NULL) {
753 		/*
754 		 * Since the reservation is being broken, there is no harm in
755 		 * abusing the population map to stop "m" from being returned
756 		 * to the physical memory allocator.
757 		 */
758 		i = m - rv->pages;
759 		KASSERT(popmap_is_clear(rv->popmap, i),
760 		    ("vm_reserv_break: reserv %p's popmap is corrupted", rv));
761 		popmap_set(rv->popmap, i);
762 		rv->popcnt++;
763 	}
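	/*
	 * The loop below walks the population map, passing each maximal run
	 * of clear bits (free pages) to vm_phys_free_contig() as a single
	 * contiguous chunk.  Set bits, which represent pages still allocated
	 * to the object, are simply cleared, draining "popcnt" to zero.
	 */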
764 	i = hi = 0;
765 	do {
766 		/* Find the next 0 bit.  Any previous 0 bits are < "hi". */
767 		lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
768 		if (lo == 0) {
769 			/* Redundantly clears bits < "hi". */
770 			rv->popmap[i] = 0;
771 			rv->popcnt -= NBPOPMAP - hi;
772 			while (++i < NPOPMAP) {
773 				lo = ffsl(~rv->popmap[i]);
774 				if (lo == 0) {
775 					rv->popmap[i] = 0;
776 					rv->popcnt -= NBPOPMAP;
777 				} else
778 					break;
779 			}
780 			if (i == NPOPMAP)
781 				break;
782 			hi = 0;
783 		}
784 		KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo));
785 		/* Convert from ffsl() to ordinary bit numbering. */
786 		lo--;
787 		if (lo > 0) {
788 			/* Redundantly clears bits < "hi". */
789 			rv->popmap[i] &= ~((1UL << lo) - 1);
790 			rv->popcnt -= lo - hi;
791 		}
792 		begin_zeroes = NBPOPMAP * i + lo;
793 		/* Find the next 1 bit. */
794 		do
795 			hi = ffsl(rv->popmap[i]);
796 		while (hi == 0 && ++i < NPOPMAP);
797 		if (i != NPOPMAP)
798 			/* Convert from ffsl() to ordinary bit numbering. */
799 			hi--;
800 		vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i +
801 		    hi - begin_zeroes);
802 	} while (i < NPOPMAP);
803 	KASSERT(rv->popcnt == 0,
804 	    ("vm_reserv_break: reserv %p's popcnt is corrupted", rv));
805 	vm_reserv_broken++;
806 }
807 
808 /*
809  * Breaks all reservations belonging to the given object.
810  */
811 void
812 vm_reserv_break_all(vm_object_t object)
813 {
814 	vm_reserv_t rv;
815 
816 	mtx_lock(&vm_page_queue_free_mtx);
817 	while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
818 		KASSERT(rv->object == object,
819 		    ("vm_reserv_break_all: reserv %p is corrupted", rv));
820 		if (rv->inpartpopq) {
821 			TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
822 			rv->inpartpopq = FALSE;
823 		}
824 		vm_reserv_break(rv, NULL);
825 	}
826 	mtx_unlock(&vm_page_queue_free_mtx);
827 }
828 
829 /*
830  * Frees the given page if it belongs to a reservation.  Returns TRUE if the
831  * page is freed and FALSE otherwise.
832  *
833  * The free page queue lock must be held.
834  */
835 boolean_t
836 vm_reserv_free_page(vm_page_t m)
837 {
838 	vm_reserv_t rv;
839 
840 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
841 	rv = vm_reserv_from_page(m);
842 	if (rv->object == NULL)
843 		return (FALSE);
844 	vm_reserv_depopulate(rv, m - rv->pages);
845 	return (TRUE);
846 }
847 
848 /*
849  * Initializes the reservation management system.  Specifically, initializes
850  * the reservation array.
851  *
852  * Requires that vm_page_array and first_page are initialized!
853  */
854 void
855 vm_reserv_init(void)
856 {
857 	vm_paddr_t paddr;
858 	struct vm_phys_seg *seg;
859 	int segind;
860 
861 	/*
862 	 * Initialize the reservation array.  Specifically, initialize the
863 	 * "pages" field for every element that has an underlying superpage.
864 	 */
865 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
866 		seg = &vm_phys_segs[segind];
867 		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
868 		while (paddr + VM_LEVEL_0_SIZE <= seg->end) {
869 			vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
870 			    PHYS_TO_VM_PAGE(paddr);
871 			paddr += VM_LEVEL_0_SIZE;
872 		}
873 	}
874 }
875 
876 /*
877  * Returns true if the given page belongs to a reservation and that page is
878  * free.  Otherwise, returns false.
879  */
880 bool
881 vm_reserv_is_page_free(vm_page_t m)
882 {
883 	vm_reserv_t rv;
884 
885 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
886 	rv = vm_reserv_from_page(m);
887 	if (rv->object == NULL)
888 		return (false);
889 	return (popmap_is_clear(rv->popmap, m - rv->pages));
890 }
891 
892 /*
893  * If the given page belongs to a reservation, returns the level of that
894  * reservation.  Otherwise, returns -1.
895  */
896 int
897 vm_reserv_level(vm_page_t m)
898 {
899 	vm_reserv_t rv;
900 
901 	rv = vm_reserv_from_page(m);
902 	return (rv->object != NULL ? 0 : -1);
903 }
904 
905 /*
906  * Returns a reservation level if the given page belongs to a fully populated
907  * reservation and -1 otherwise.
908  */
909 int
910 vm_reserv_level_iffullpop(vm_page_t m)
911 {
912 	vm_reserv_t rv;
913 
914 	rv = vm_reserv_from_page(m);
915 	return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
916 }
917 
918 /*
919  * Breaks the given partially populated reservation, releasing its free pages
920  * to the physical memory allocator.
921  *
922  * The free page queue lock must be held.
923  */
924 static void
925 vm_reserv_reclaim(vm_reserv_t rv)
926 {
927 
928 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
929 	KASSERT(rv->inpartpopq,
930 	    ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
931 	TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
932 	rv->inpartpopq = FALSE;
933 	vm_reserv_break(rv, NULL);
934 	vm_reserv_reclaimed++;
935 }
936 
937 /*
938  * Breaks the reservation at the head of the partially populated reservation
939  * queue, releasing its free pages to the physical memory allocator.  Returns
940  * TRUE if a reservation is broken and FALSE otherwise.
941  *
942  * The free page queue lock must be held.
943  */
944 boolean_t
945 vm_reserv_reclaim_inactive(void)
946 {
947 	vm_reserv_t rv;
948 
949 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
950 	if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) {
951 		vm_reserv_reclaim(rv);
952 		return (TRUE);
953 	}
954 	return (FALSE);
955 }
956 
957 /*
958  * Searches the partially populated reservation queue for the least recently
959  * changed reservation with free pages that satisfy the given request for
960  * contiguous physical memory.  If a satisfactory reservation is found, it is
961  * broken.  Returns TRUE if a reservation is broken and FALSE otherwise.
962  *
963  * The free page queue lock must be held.
964  */
965 boolean_t
966 vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
967     u_long alignment, vm_paddr_t boundary)
968 {
969 	vm_paddr_t pa, size;
970 	vm_reserv_t rv;
971 	int hi, i, lo, low_index, next_free;
972 
973 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
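	/*
	 * A request for VM_LEVEL_0_NPAGES or more pages can never be
	 * satisfied by a partially populated reservation, since at least one
	 * of its pages is already in use, so such requests are rejected
	 * outright.
	 */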
974 	if (npages > VM_LEVEL_0_NPAGES - 1)
975 		return (FALSE);
976 	size = npages << PAGE_SHIFT;
977 	TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
978 		pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
979 		if (pa + PAGE_SIZE - size < low) {
980 			/* This entire reservation is too low; go to next. */
981 			continue;
982 		}
983 		pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
984 		if (pa + size > high) {
985 			/* This entire reservation is too high; go to next. */
986 			continue;
987 		}
988 		if (pa < low) {
989 			/* Start the search for free pages at "low". */
990 			low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT;
991 			i = low_index / NBPOPMAP;
992 			hi = low_index % NBPOPMAP;
993 		} else
994 			i = hi = 0;
995 		do {
996 			/* Find the next free page. */
997 			lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
998 			while (lo == 0 && ++i < NPOPMAP)
999 				lo = ffsl(~rv->popmap[i]);
1000 			if (i == NPOPMAP)
1001 				break;
1002 			/* Convert from ffsl() to ordinary bit numbering. */
1003 			lo--;
1004 			next_free = NBPOPMAP * i + lo;
1005 			pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]);
1006 			KASSERT(pa >= low,
1007 			    ("vm_reserv_reclaim_contig: pa is too low"));
1008 			if (pa + size > high) {
1009 				/* The rest of this reservation is too high. */
1010 				break;
1011 			} else if ((pa & (alignment - 1)) != 0 ||
1012 			    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
1013 				/*
1014 				 * The current page doesn't meet the alignment
1015 				 * and/or boundary requirements.  Continue
1016 				 * searching this reservation until the rest
1017 				 * of its free pages are either excluded or
1018 				 * exhausted.
1019 				 */
1020 				hi = lo + 1;
1021 				if (hi >= NBPOPMAP) {
1022 					hi = 0;
1023 					i++;
1024 				}
1025 				continue;
1026 			}
1027 			/* Find the next used page. */
1028 			hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1));
1029 			while (hi == 0 && ++i < NPOPMAP) {
1030 				if ((NBPOPMAP * i - next_free) * PAGE_SIZE >=
1031 				    size) {
1032 					vm_reserv_reclaim(rv);
1033 					return (TRUE);
1034 				}
1035 				hi = ffsl(rv->popmap[i]);
1036 			}
1037 			/* Convert from ffsl() to ordinary bit numbering. */
1038 			if (i != NPOPMAP)
1039 				hi--;
1040 			if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >=
1041 			    size) {
1042 				vm_reserv_reclaim(rv);
1043 				return (TRUE);
1044 			}
1045 		} while (i < NPOPMAP);
1046 	}
1047 	return (FALSE);
1048 }
1049 
1050 /*
1051  * Transfers the reservation underlying the given page to a new object.
1052  *
1053  * The object must be locked.
1054  */
1055 void
1056 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
1057     vm_pindex_t old_object_offset)
1058 {
1059 	vm_reserv_t rv;
1060 
1061 	VM_OBJECT_ASSERT_WLOCKED(new_object);
1062 	rv = vm_reserv_from_page(m);
1063 	if (rv->object == old_object) {
1064 		mtx_lock(&vm_page_queue_free_mtx);
1065 		if (rv->object == old_object) {
1066 			LIST_REMOVE(rv, objq);
1067 			LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
1068 			rv->object = new_object;
1069 			rv->pindex -= old_object_offset;
1070 		}
1071 		mtx_unlock(&vm_page_queue_free_mtx);
1072 	}
1073 }
1074 
1075 /*
1076  * Returns the size (in bytes) of a reservation of the specified level.
1077  */
1078 int
1079 vm_reserv_size(int level)
1080 {
1081 
1082 	switch (level) {
1083 	case 0:
1084 		return (VM_LEVEL_0_SIZE);
1085 	case -1:
1086 		return (PAGE_SIZE);
1087 	default:
1088 		return (0);
1089 	}
1090 }
1091 
1092 /*
1093  * Allocates the virtual and physical memory required by the reservation
1094  * management system's data structures, in particular, the reservation array.
1095  */
1096 vm_paddr_t
1097 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water)
1098 {
1099 	vm_paddr_t new_end;
1100 	size_t size;
1101 
1102 	/*
1103 	 * Calculate the size (in bytes) of the reservation array.  Round up
1104 	 * from "high_water" because every small page is mapped to an element
1105 	 * in the reservation array based on its physical address.  Thus, the
1106 	 * number of elements in the reservation array can be greater than the
1107 	 * number of superpages.
1108 	 */
1109 	size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv);
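	/*
	 * For example, with 2MB reservations and a "high_water" of 16GB,
	 * howmany() yields 8192 array elements; at roughly 128 bytes per
	 * struct vm_reserv on a 64-bit platform, the array then occupies
	 * about 1MB.
	 */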
1110 
1111 	/*
1112 	 * Allocate and map the physical memory for the reservation array.  The
1113 	 * next available virtual address is returned by reference.
1114 	 */
1115 	new_end = end - round_page(size);
1116 	vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
1117 	    VM_PROT_READ | VM_PROT_WRITE);
1118 	bzero(vm_reserv_array, size);
1119 
1120 	/*
1121 	 * Return the next available physical address.
1122 	 */
1123 	return (new_end);
1124 }
1125 
1126 /*
1127  * Returns the superpage containing the given page.
1128  */
1129 vm_page_t
1130 vm_reserv_to_superpage(vm_page_t m)
1131 {
1132 	vm_reserv_t rv;
1133 
1134 	VM_OBJECT_ASSERT_LOCKED(m->object);
1135 	rv = vm_reserv_from_page(m);
1136 	return (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES ?
1137 	    rv->pages : NULL);
1138 }
1139 
1140 #endif	/* VM_NRESERVLEVEL > 0 */
1141