/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2017,	Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/domainset.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/rwlock.h>
#include <sys/pctrie.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_domainset.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>

#ifdef NUMA
/*
 * Iterators are written such that the first nowait pass has as short a
 * codepath as possible to eliminate bloat from the allocator.  It is
 * assumed that most allocations are successful.
 */
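
/*
 * Illustrative caller pattern (a sketch, not taken verbatim from any one
 * caller): the iterator is primed once, the first allocation attempt runs
 * with nowait semantics, and only on failure does the loop advance to the
 * remaining domains.  'try_alloc_page' is a hypothetical per-domain
 * allocation helper standing in for e.g. the vm_page allocator:
 *
 *	struct vm_domainset_iter di;
 *	vm_page_t m = NULL;
 *	int domain, req = VM_ALLOC_NORMAL | VM_ALLOC_WAITOK;
 *
 *	if (vm_domainset_iter_page_init(&di, obj, pindex, &domain, &req) == 0)
 *		do {
 *			m = try_alloc_page(obj, pindex, domain, req);
 *		} while (m == NULL &&
 *		    vm_domainset_iter_page(&di, obj, &domain, NULL) == 0);
 */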

static int vm_domainset_default_stride = 64;

static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain);

/*
 * Determine which policy is to be used for this allocation.
 */
static void
vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
    int *iter, struct vm_object *obj, vm_pindex_t pindex)
{

	di->di_domain = ds;
	di->di_iter = iter;
	di->di_policy = ds->ds_policy;
	DOMAINSET_COPY(&ds->ds_mask, &di->di_valid_mask);
	if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) {
#if VM_NRESERVLEVEL > 0
		if (vm_object_reserv(obj)) {
			/*
			 * Color the pindex so we end up on the correct
			 * reservation boundary.
			 */
			pindex += obj->pg_color;
#if VM_NRESERVLEVEL > 1
			pindex >>= VM_LEVEL_1_ORDER;
#endif
			pindex >>= VM_LEVEL_0_ORDER;
		} else
#endif
			pindex /= vm_domainset_default_stride;
		/*
		 * Offset pindex so the first page of each object does
		 * not end up in domain 0.
		 */
		if (obj != NULL)
			pindex += (((uintptr_t)obj) / sizeof(*obj));
		di->di_offset = pindex;
	}
}
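
/*
 * Worked example of the interleave arithmetic above (the constants are
 * platform assumptions, e.g. amd64 with VM_NRESERVLEVEL == 1 and
 * VM_LEVEL_0_ORDER == 9): pindex is shifted right by 9, so each aligned
 * run of 512 pages (one 2MB reservation) selects a single slot of
 * 'ds_order' and successive reservations land in successive domains.
 * Without reservations, the default stride of 64 pages plays the same
 * role.  The object-pointer offset merely staggers objects so they do not
 * all start in ds_order[0].
 */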

static void
vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
{

	/* Grab the next domain in 'ds_order'. */
	*domain = di->di_domain->ds_order[
	    (*di->di_iter)++ % di->di_domain->ds_cnt];
}
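
/*
 * Worked example (a sketch): with ds_cnt == 3 and ds_order == {0, 1, 2},
 * successive calls yield domains 0, 1, 2, 0, 1, ...  Because '*di_iter'
 * points at per-thread or per-object state that outlives any single
 * iterator, consecutive allocations keep rotating rather than restarting
 * at ds_order[0].
 */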

static void
vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
{
	int d;

	d = di->di_offset % di->di_domain->ds_cnt;
	*di->di_iter = d;
	*domain = di->di_domain->ds_order[d];
}

/*
 * Internal function determining the current phase's first candidate domain.
 *
 * Returns whether there is an eligible domain, which is returned through
 * '*domain'.  '*domain' can be modified even if there is no eligible domain.
 *
 * See the herald comment of vm_domainset_iter_first() below about phases.
 */
static bool
vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain)
{
	switch (di->di_policy) {
	case DOMAINSET_POLICY_FIRSTTOUCH:
		*domain = PCPU_GET(domain);
		break;
	case DOMAINSET_POLICY_ROUNDROBIN:
		vm_domainset_iter_rr(di, domain);
		break;
	case DOMAINSET_POLICY_PREFER:
		*domain = di->di_domain->ds_prefer;
		break;
	case DOMAINSET_POLICY_INTERLEAVE:
		vm_domainset_iter_interleave(di, domain);
		break;
	default:
		panic("%s: Unknown policy %d", __func__, di->di_policy);
	}
	KASSERT(*domain < vm_ndomains,
	    ("%s: Invalid domain %d", __func__, *domain));

	/*
	 * Has the policy's start domain already been visited?
	 */
	if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask))
		return (vm_domainset_iter_next(di, domain));

	DOMAINSET_CLR(*domain, &di->di_remain_mask);

	/* Does it have enough free pages (phase 1)? */
	if (di->di_minskip && vm_page_count_min_domain(*domain)) {
		/* Mark the domain as eligible for phase 2. */
		DOMAINSET_SET(*domain, &di->di_min_mask);
		return (vm_domainset_iter_next(di, domain));
	}

	return (true);
}

/*
 * Resets an iterator to point to the first candidate domain.
 *
 * Returns whether there is an eligible domain to start with.  '*domain' may be
 * modified even if there is none.
 *
 * vm_domainset_iter_init() must have been called on 'di' beforehand.
 *
 * This function must be called at least once before calling
 * vm_domainset_iter_next().  Note that functions wrapping
 * vm_domainset_iter_init() usually do that themselves.
 *
 * This function may be called again to reset the iterator to the policy's
 * first candidate domain.  After each reset, the iterator visits the same
 * domains as in the previous iteration minus those on which
 * vm_domainset_iter_ignore() has been called.  Note that the first candidate
 * domain may change at each reset (at the time of this writing, only with the
 * DOMAINSET_POLICY_ROUNDROBIN policy).
 *
 * Domains whose number of free pages is above 'v_free_min' are always visited
 * first (this is called "phase 1" in comments, "phase 2" being the
 * examination of the remaining domains; no domain is ever visited twice).
 */
static bool
vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
{
	/* Initialize the mask of domains to visit. */
	DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
	/*
	 * No candidate domains for phase 2 at start.  This will be filled by
	 * phase 1.
	 */
	DOMAINSET_ZERO(&di->di_min_mask);
	/* Skip domains below 'v_free_min' during phase 1. */
	di->di_minskip = true;

	return (vm_domainset_iter_phase_first(di, domain));
}
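
/*
 * Illustrative two-phase trace (an assumed configuration): with
 * vm_ndomains == 4, a round-robin policy and only domain 2 below
 * 'v_free_min', phase 1 visits domains 0, 1 and 3 while recording bit 2 in
 * 'di_min_mask'; once 'di_remain_mask' is exhausted,
 * vm_domainset_iter_next() clears 'di_minskip', reloads 'di_remain_mask'
 * from 'di_min_mask' and phase 2 visits domain 2.  No domain is visited
 * twice.
 */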

/*
 * Advances the iterator to the next candidate domain.
 *
 * Returns whether there was another domain to visit.  '*domain' may be
 * modified even if there is none.
 *
 * vm_domainset_iter_first() must have been called at least once before using
 * this function (see its herald comment for more details on iterators).
 */
static bool
vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
{
	/* Loop while domains remain to be visited in the current phase. */
	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
		/* Grab the next domain in 'ds_order'. */
		vm_domainset_iter_rr(di, domain);
		KASSERT(*domain < vm_ndomains,
		    ("%s: Invalid domain %d", __func__, *domain));

		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
			DOMAINSET_CLR(*domain, &di->di_remain_mask);
			if (!di->di_minskip ||
			    !vm_page_count_min_domain(*domain))
				return (true);
			DOMAINSET_SET(*domain, &di->di_min_mask);
		}
	}

	/*
	 * If phase 1 (skip low memory domains) is over, start phase 2
	 * (consider low memory domains).
	 */
	if (di->di_minskip) {
		di->di_minskip = false;
		/* Browse domains that were under 'v_free_min'. */
		DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);
		return (vm_domainset_iter_phase_first(di, domain));
	}

	return (false);
}

int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
    vm_pindex_t pindex, int *domain, int *req)
{
	struct domainset_ref *dr;

	di->di_flags = *req;
	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
	    VM_ALLOC_NOWAIT;

	/*
	 * Object policy takes precedence over thread policy.  The policies
	 * are immutable and unsynchronized.  Updates can race but pointer
	 * loads are assumed to be atomic.
	 */
	if (obj != NULL && obj->domain.dr_policy != NULL)
		dr = &obj->domain;
	else
		dr = &curthread->td_domain;

	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex);
	/*
	 * XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was
	 * passed?
	 */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
    int *domain, struct pctrie_iter *pages)
{
	if (vm_domainset_iter_next(di, domain))
		return (0);

	/* If we visited all domains and this was a NOWAIT request, fail. */
	if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
		return (ENOMEM);

	/* Wait for one of the domains to accumulate some free pages. */
	if (obj != NULL) {
		VM_OBJECT_WUNLOCK(obj);
		if (pages != NULL)
			pctrie_iter_reset(pages);
	}
	vm_wait_doms(&di->di_valid_mask, 0);
	if (obj != NULL)
		VM_OBJECT_WLOCK(obj);
	if ((di->di_flags & VM_ALLOC_WAITFAIL) != 0)
		return (ENOMEM);

	/* Restart the search. */
	/* XXXOC: Shouldn't we just panic on 'false'? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
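
/*
 * Behavior of vm_domainset_iter_page() once every domain has been visited,
 * keyed on the flags captured by vm_domainset_iter_page_init():
 *
 *	neither VM_ALLOC_WAITOK nor VM_ALLOC_WAITFAIL (i.e., NOWAIT):
 *		return ENOMEM immediately;
 *	VM_ALLOC_WAITFAIL: sleep in vm_wait_doms(), then return ENOMEM;
 *	VM_ALLOC_WAITOK: sleep in vm_wait_doms(), then restart the iteration
 *		from the policy's first candidate domain.
 */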

static int
_vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain,
    int *flags)
{
	di->di_flags = *flags;
	*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
	/* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
    struct domainset *ds, int *domain, int *flags)
{

	vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0);
	return (_vm_domainset_iter_policy_init(di, domain, flags));
}

int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
    struct domainset_ref *dr, int *domain, int *flags)
{

	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0);
	return (_vm_domainset_iter_policy_init(di, domain, flags));
}

int
vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
{
	if (vm_domainset_iter_next(di, domain))
		return (0);

	/* If we visited all domains and this was a NOWAIT request, fail. */
	if ((di->di_flags & M_WAITOK) == 0)
		return (ENOMEM);

	/* Wait for one of the domains to accumulate some free pages. */
	vm_wait_doms(&di->di_valid_mask, 0);

	/* Restart the search. */
	/* XXXOC: Shouldn't we just panic on 'false'? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
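
/*
 * Illustrative caller loop for the policy iterators (a sketch;
 * 'alloc_domain' is a hypothetical per-domain allocation routine for a
 * malloc-style allocator honoring M_WAITOK/M_NOWAIT):
 *
 *	struct vm_domainset_iter di;
 *	void *mem = NULL;
 *	int domain, flags = M_WAITOK;
 *
 *	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) == 0)
 *		do {
 *			mem = alloc_domain(size, domain, flags);
 *		} while (mem == NULL &&
 *		    vm_domainset_iter_policy(&di, &domain) == 0);
 */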

void
vm_domainset_iter_ignore(struct vm_domainset_iter *di, int domain)
{
	KASSERT(DOMAINSET_ISSET(domain, &di->di_valid_mask),
	    ("%s: domain %d not present in di_valid_mask for di %p",
	    __func__, domain, di));
	DOMAINSET_CLR(domain, &di->di_valid_mask);
}
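
/*
 * Note: because vm_domainset_iter_ignore() clears the domain from
 * 'di_valid_mask', any later vm_domainset_iter_first() reset skips it as
 * well, so a caller can permanently drop a domain that keeps failing
 * mid-iteration.
 */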

#else /* !NUMA */

int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
    int *domain, struct pctrie_iter *pages)
{

	return (EJUSTRETURN);
}

int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
    vm_pindex_t pindex, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}

int
vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
{

	return (EJUSTRETURN);
}

int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
    struct domainset *ds, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}

int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
    struct domainset_ref *dr, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}

void
vm_domainset_iter_ignore(struct vm_domainset_iter *di __unused,
    int domain __unused)
{
}

#endif /* NUMA */