1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2017, Jeffrey Roberson <jeff@freebsd.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
12 * disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 */
29
30 #include <sys/cdefs.h>
31 #include "opt_vm.h"
32
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/bitset.h>
36 #include <sys/domainset.h>
37 #include <sys/proc.h>
38 #include <sys/lock.h>
39 #include <sys/mutex.h>
40 #include <sys/malloc.h>
41 #include <sys/rwlock.h>
42 #include <sys/pctrie.h>
43 #include <sys/vmmeter.h>
44
45 #include <vm/vm.h>
46 #include <vm/vm_param.h>
47 #include <vm/vm_domainset.h>
48 #include <vm/vm_object.h>
49 #include <vm/vm_page.h>
50 #include <vm/vm_phys.h>
51
52 #ifdef NUMA
53 /*
54 * Iterators are written such that the first nowait pass has as short a
55 * codepath as possible to eliminate bloat from the allocator. It is
56 * assumed that most allocations are successful.
57 */
58
/*
 * Number of consecutive pages an object keeps in one domain before the
 * interleave policy moves to the next one, when the object is not backed
 * by reservations (see vm_domainset_iter_init()).
 */
static int vm_domainset_default_stride = 64;

/* Forward declaration: _phase_first() and _next() are mutually recursive. */
static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain);
62
63
/*
 * Determine which policy is to be used for this allocation.
 *
 * Initializes 'di' from the domain set 'ds', using '*iter' as the
 * externally stored round-robin cursor.  For the interleave policy, a
 * starting offset is derived from 'pindex' (and 'obj', when non-NULL) so
 * that consecutive chunks of an object are spread across domains.
 */
static void
vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
    int *iter, struct vm_object *obj, vm_pindex_t pindex)
{

	di->di_domain = ds;
	di->di_iter = iter;
	di->di_policy = ds->ds_policy;
	DOMAINSET_COPY(&ds->ds_mask, &di->di_valid_mask);
	if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) {
#if VM_NRESERVLEVEL > 0
		if (vm_object_reserv(obj)) {
			/*
			 * Color the pindex so we end up on the correct
			 * reservation boundary.
			 */
			pindex += obj->pg_color;
			/* Scale down to one unit per reservation. */
#if VM_NRESERVLEVEL > 1
			pindex >>= VM_LEVEL_1_ORDER;
#endif
			pindex >>= VM_LEVEL_0_ORDER;
		} else
#endif
			pindex /= vm_domainset_default_stride;
		/*
		 * Offset pindex so the first page of each object does
		 * not end up in domain 0.
		 */
		if (obj != NULL)
			pindex += (((uintptr_t)obj) / sizeof(*obj));
		di->di_offset = pindex;
	}
}
100
101 static void
vm_domainset_iter_rr(struct vm_domainset_iter * di,int * domain)102 vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
103 {
104
105 /* Grab the next domain in 'ds_order'. */
106 *domain = di->di_domain->ds_order[
107 (*di->di_iter)++ % di->di_domain->ds_cnt];
108 }
109
110 static void
vm_domainset_iter_interleave(struct vm_domainset_iter * di,int * domain)111 vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
112 {
113 int d;
114
115 d = di->di_offset % di->di_domain->ds_cnt;
116 *di->di_iter = d;
117 *domain = di->di_domain->ds_order[d];
118 }
119
/*
 * Internal function determining the current phase's first candidate domain.
 *
 * Returns whether there is an eligible domain, which is returned through
 * '*domain'.  '*domain' can be modified even if there is no eligible domain.
 *
 * See herald comment of vm_domainset_iter_first() below about phases.
 */
static bool
vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain)
{
	/* Pick the policy's preferred starting domain. */
	switch (di->di_policy) {
	case DOMAINSET_POLICY_FIRSTTOUCH:
		/* Start on the domain of the CPU we are running on. */
		*domain = PCPU_GET(domain);
		break;
	case DOMAINSET_POLICY_ROUNDROBIN:
		vm_domainset_iter_rr(di, domain);
		break;
	case DOMAINSET_POLICY_PREFER:
		*domain = di->di_domain->ds_prefer;
		break;
	case DOMAINSET_POLICY_INTERLEAVE:
		vm_domainset_iter_interleave(di, domain);
		break;
	default:
		panic("%s: Unknown policy %d", __func__, di->di_policy);
	}
	KASSERT(*domain < vm_ndomains,
	    ("%s: Invalid domain %d", __func__, *domain));

	/*
	 * Has the policy's start domain already been visited?  If so, fall
	 * through to the generic successor function.
	 */
	if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask))
		return (vm_domainset_iter_next(di, domain));

	DOMAINSET_CLR(*domain, &di->di_remain_mask);

	/* Does it have enough free pages (phase 1)? */
	if (di->di_minskip && vm_page_count_min_domain(*domain)) {
		/* Mark the domain as eligible for phase 2. */
		DOMAINSET_SET(*domain, &di->di_min_mask);
		return (vm_domainset_iter_next(di, domain));
	}

	return (true);
}
167
168 /*
169 * Resets an iterator to point to the first candidate domain.
170 *
171 * Returns whether there is an eligible domain to start with. '*domain' may be
172 * modified even if there is none.
173 *
174 * There must have been one call to vm_domainset_iter_init() before.
175 *
176 * This function must be called at least once before calling
177 * vm_domainset_iter_next(). Note that functions wrapping
178 * vm_domainset_iter_init() usually do that themselves.
179 *
180 * This function may be called again to reset the iterator to the policy's first
181 * candidate domain. After each reset, the iterator will visit the same domains
182 * as in the previous iteration minus those on which vm_domainset_iter_ignore()
183 * has been called. Note that the first candidate domain may change at each
184 * reset (at time of this writing, only on the DOMAINSET_POLICY_ROUNDROBIN
185 * policy).
186 *
187 * Domains which have a number of free pages over 'v_free_min' are always
188 * visited first (this is called the "phase 1" in comments, "phase 2" being the
189 * examination of the remaining domains; no domains are ever visited twice).
190 */
191 static bool
vm_domainset_iter_first(struct vm_domainset_iter * di,int * domain)192 vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
193 {
194 /* Initialize the mask of domains to visit. */
195 DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
196 /*
197 * No candidate domains for phase 2 at start. This will be filled by
198 * phase 1.
199 */
200 DOMAINSET_ZERO(&di->di_min_mask);
201 /* Skip domains below 'v_free_min' on phase 1. */
202 di->di_minskip = true;
203
204 return (vm_domainset_iter_phase_first(di, domain));
205 }
206
/*
 * Advances the iterator to the next candidate domain.
 *
 * Returns whether there was another domain to visit. '*domain' may be modified
 * even if there is none.
 *
 * vm_domainset_iter_first() must have been called at least once before using
 * this function (see its herald comment for more details on iterators).
 */
static bool
vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
{
	/* Loop while there remains domains to visit in the current phase. */
	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
		/* Grab the next domain in 'ds_order'. */
		vm_domainset_iter_rr(di, domain);
		KASSERT(*domain < vm_ndomains,
		    ("%s: Invalid domain %d", __func__, *domain));

		/*
		 * 'ds_order' may yield domains already visited or outside
		 * the remaining set; only act on still-pending ones.
		 */
		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
			DOMAINSET_CLR(*domain, &di->di_remain_mask);
			if (!di->di_minskip || !vm_page_count_min_domain(*domain))
				return (true);
			/* Low on free pages: defer this domain to phase 2. */
			DOMAINSET_SET(*domain, &di->di_min_mask);
		}
	}

	/*
	 * If phase 1 (skip low memory domains) is over, start phase 2 (consider
	 * low memory domains).
	 */
	if (di->di_minskip) {
		di->di_minskip = false;
		/* Browse domains that were under 'v_free_min'. */
		DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);
		return (vm_domainset_iter_phase_first(di, domain));
	}

	return (false);
}
247
248 int
vm_domainset_iter_page_init(struct vm_domainset_iter * di,struct vm_object * obj,vm_pindex_t pindex,int * domain,int * req)249 vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
250 vm_pindex_t pindex, int *domain, int *req)
251 {
252 struct domainset_ref *dr;
253
254 di->di_flags = *req;
255 *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
256 VM_ALLOC_NOWAIT;
257
258 /*
259 * Object policy takes precedence over thread policy. The policies
260 * are immutable and unsynchronized. Updates can race but pointer
261 * loads are assumed to be atomic.
262 */
263 if (obj != NULL && obj->domain.dr_policy != NULL)
264 dr = &obj->domain;
265 else
266 dr = &curthread->td_domain;
267
268 vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex);
269 /*
270 * XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was
271 * passed?
272 */
273 return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
274 }
275
/*
 * Advances a page-allocation iterator to the next candidate domain.
 *
 * Returns 0 and sets '*domain' on success.  Returns ENOMEM once all
 * domains have been visited and the original request was NOWAIT or had
 * VM_ALLOC_WAITFAIL set.  Otherwise sleeps until one of the valid
 * domains gains free pages and restarts the iteration; the object lock,
 * if any, is dropped and reacquired around the sleep.
 */
int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
    int *domain, struct pctrie_iter *pages)
{
	if (vm_domainset_iter_next(di, domain))
		return (0);

	/* If we visited all domains and this was a NOWAIT we return error. */
	if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
		return (ENOMEM);

	/* Wait for one of the domains to accumulate some free pages. */
	if (obj != NULL) {
		VM_OBJECT_WUNLOCK(obj);
		/*
		 * NOTE(review): presumably the trie iterator is stale once
		 * the object lock is dropped, hence the reset — confirm.
		 */
		if (pages != NULL)
			pctrie_iter_reset(pages);
	}
	vm_wait_doms(&di->di_valid_mask, 0);
	if (obj != NULL)
		VM_OBJECT_WLOCK(obj);
	if ((di->di_flags & VM_ALLOC_WAITFAIL) != 0)
		return (ENOMEM);

	/* Restart the search. */
	/* XXXOC: Shouldn't we just panic on 'false'? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
303
304 static int
_vm_domainset_iter_policy_init(struct vm_domainset_iter * di,int * domain,int * flags)305 _vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain,
306 int *flags)
307 {
308 di->di_flags = *flags;
309 *flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
310 /* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */
311 return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
312 }
313
314 int
vm_domainset_iter_policy_init(struct vm_domainset_iter * di,struct domainset * ds,int * domain,int * flags)315 vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
316 struct domainset *ds, int *domain, int *flags)
317 {
318
319 vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0);
320 return (_vm_domainset_iter_policy_init(di, domain, flags));
321 }
322
323 int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter * di,struct domainset_ref * dr,int * domain,int * flags)324 vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
325 struct domainset_ref *dr, int *domain, int *flags)
326 {
327
328 vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0);
329 return (_vm_domainset_iter_policy_init(di, domain, flags));
330 }
331
332 int
vm_domainset_iter_policy(struct vm_domainset_iter * di,int * domain)333 vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
334 {
335 if (vm_domainset_iter_next(di, domain))
336 return (0);
337
338 /* If we visited all domains and this was a NOWAIT we return error. */
339 if ((di->di_flags & M_WAITOK) == 0)
340 return (ENOMEM);
341
342 /* Wait for one of the domains to accumulate some free pages. */
343 vm_wait_doms(&di->di_valid_mask, 0);
344
345 /* Restart the search. */
346 /* XXXOC: Shouldn't we just panic on 'false'? */
347 return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
348 }
349
/*
 * Removes 'domain' from the set of domains the iterator may return.
 *
 * The exclusion persists across vm_domainset_iter_first() restarts, which
 * re-seed the remaining set from 'di_valid_mask'.  The domain must still
 * be present in the valid mask when this is called.
 */
void
vm_domainset_iter_ignore(struct vm_domainset_iter *di, int domain)
{
	KASSERT(DOMAINSET_ISSET(domain, &di->di_valid_mask),
	    ("%s: domain %d not present in di_valid_mask for di %p",
	    __func__, domain, di));
	DOMAINSET_CLR(domain, &di->di_valid_mask);
}
358
359 #else /* !NUMA */
360
/*
 * Non-NUMA stub: there is only one domain, so there is never a "next"
 * candidate; EJUSTRETURN tells the caller the single pass is complete.
 */
int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
    int *domain, struct pctrie_iter *pages)
{

	return (EJUSTRETURN);
}
368
/*
 * Non-NUMA stub: all allocations come from the sole domain, 0.
 */
int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
    vm_pindex_t pindex, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}
376
/*
 * Non-NUMA stub: no further candidates exist after the single pass;
 * EJUSTRETURN signals the caller to stop iterating.
 */
int
vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
{

	return (EJUSTRETURN);
}
383
/*
 * Non-NUMA stub: always start (and end) at domain 0.
 */
int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
    struct domainset *ds, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}
391
/*
 * Non-NUMA stub: always start (and end) at domain 0.
 */
int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
    struct domainset_ref *dr, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}
399
/*
 * Non-NUMA stub: with a single domain there is nothing to exclude.
 */
void
vm_domainset_iter_ignore(struct vm_domainset_iter *di __unused,
    int domain __unused)
{
}
405
406 #endif /* NUMA */
407