/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2017, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/domainset.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/rwlock.h>
#include <sys/pctrie.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_domainset.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>

#ifdef NUMA
/*
 * Iterators are written such that the first nowait pass has as short a
 * codepath as possible to eliminate bloat from the allocator.  It is
 * assumed that most allocations are successful.
 */

static int vm_domainset_default_stride = 64;

static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain);

/*
 * Determine which policy is to be used for this allocation.
 */
static void
vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
    int *iter, struct vm_object *obj, vm_pindex_t pindex)
{

	di->di_domain = ds;
	di->di_iter = iter;
	di->di_policy = ds->ds_policy;
	DOMAINSET_COPY(&ds->ds_mask, &di->di_valid_mask);
	if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) {
#if VM_NRESERVLEVEL > 0
		if (vm_object_reserv(obj)) {
			/*
			 * Color the pindex so we end up on the correct
			 * reservation boundary.
			 */
			pindex += obj->pg_color;
#if VM_NRESERVLEVEL > 1
			pindex >>= VM_LEVEL_1_ORDER;
#endif
			pindex >>= VM_LEVEL_0_ORDER;
		} else
#endif
			pindex /= vm_domainset_default_stride;
		/*
		 * Offset pindex so the first page of each object does
		 * not end up in domain 0.
		 */
		if (obj != NULL)
			pindex += (((uintptr_t)obj) / sizeof(*obj));
		di->di_offset = pindex;
	}
}

static void
vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
{

	/* Grab the next domain in 'ds_order'. */
	*domain = di->di_domain->ds_order[
	    (*di->di_iter)++ % di->di_domain->ds_cnt];
}

static void
vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
{
	int d;

	d = di->di_offset % di->di_domain->ds_cnt;
	*domain = di->di_domain->ds_order[d];
}

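/*
 * To illustrate the interleave arithmetic, below is a minimal sketch of the
 * pindex-to-domain mapping performed by vm_domainset_iter_init() and
 * vm_domainset_iter_interleave() together, assuming a hypothetical two-domain
 * set, the default stride of 64, no reservations, and ignoring the per-object
 * offset; it is illustration only and not part of the build:
 *
 *	static int order[] = { 0, 1 };	// stands in for ds_order
 *	static int cnt = 2;		// stands in for ds_cnt
 *	static int stride = 64;		// vm_domainset_default_stride
 *
 *	static int
 *	interleave_domain_of(uint64_t pindex)
 *	{
 *		// Pindexes 0..63 select order[0], 64..127 select order[1],
 *		// 128..191 select order[0] again, and so on.
 *		return (order[(pindex / stride) % cnt]);
 *	}
 */
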
/*
 * Internal function determining the current phase's first candidate domain.
 *
 * Returns whether there is an eligible domain, which is returned through
 * '*domain'.  '*domain' can be modified even if there is no eligible domain.
 *
 * See the herald comment of vm_domainset_iter_first() below about phases.
 */
static bool
vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain)
{
	switch (di->di_policy) {
	case DOMAINSET_POLICY_FIRSTTOUCH:
		*domain = PCPU_GET(domain);
		break;
	case DOMAINSET_POLICY_ROUNDROBIN:
		vm_domainset_iter_rr(di, domain);
		break;
	case DOMAINSET_POLICY_PREFER:
		*domain = di->di_domain->ds_prefer;
		break;
	case DOMAINSET_POLICY_INTERLEAVE:
		vm_domainset_iter_interleave(di, domain);
		break;
	default:
		panic("%s: Unknown policy %d", __func__, di->di_policy);
	}
	KASSERT(*domain < vm_ndomains,
	    ("%s: Invalid domain %d", __func__, *domain));

	/*
	 * Has the policy's start domain already been visited?
	 */
	if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask))
		return (vm_domainset_iter_next(di, domain));

	DOMAINSET_CLR(*domain, &di->di_remain_mask);

	/* Does it have enough free pages (phase 1)? */
	if (di->di_minskip && vm_page_count_min_domain(*domain)) {
		/* Mark the domain as eligible for phase 2. */
		DOMAINSET_SET(*domain, &di->di_min_mask);
		return (vm_domainset_iter_next(di, domain));
	}

	return (true);
}

/*
 * Resets an iterator to point to the first candidate domain.
 *
 * Returns whether there is an eligible domain to start with.  '*domain' may
 * be modified even if there is none.
 *
 * There must have been one call to vm_domainset_iter_init() before.
 *
 * This function must be called at least once before calling
 * vm_domainset_iter_next().  Note that functions wrapping
 * vm_domainset_iter_init() usually do that themselves.
 *
 * This function may be called again to reset the iterator to the policy's
 * first candidate domain.  After each reset, the iterator will visit the same
 * domains as in the previous iteration minus those on which
 * vm_domainset_iter_ignore() has been called.  Note that the first candidate
 * domain may change at each reset (at the time of this writing, only with the
 * DOMAINSET_POLICY_ROUNDROBIN policy).
 *
 * Domains whose number of free pages is above 'v_free_min' are always visited
 * first (this is called "phase 1" in comments, "phase 2" being the
 * examination of the remaining domains; no domain is ever visited twice).
 */
static bool
vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
{
	/* Initialize the mask of domains to visit. */
	DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
	/*
	 * No candidate domains for phase 2 at start.  This will be filled by
	 * phase 1.
	 */
	DOMAINSET_ZERO(&di->di_min_mask);
	/* Skip domains below 'v_free_min' during phase 1. */
	di->di_minskip = true;

	return (vm_domainset_iter_phase_first(di, domain));
}

/*
 * Advances the iterator to the next candidate domain.
 *
 * Returns whether there was another domain to visit.  '*domain' may be
 * modified even if there is none.
 *
 * vm_domainset_iter_first() must have been called at least once before using
 * this function (see its herald comment for more details on iterators).
 */
static bool
vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
{
	/* Loop while domains remain to be visited in the current phase. */
	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
		/* Grab the next domain in 'ds_order'. */
		vm_domainset_iter_rr(di, domain);
		KASSERT(*domain < vm_ndomains,
		    ("%s: Invalid domain %d", __func__, *domain));

		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
			DOMAINSET_CLR(*domain, &di->di_remain_mask);
			if (!di->di_minskip ||
			    !vm_page_count_min_domain(*domain))
				return (true);
			DOMAINSET_SET(*domain, &di->di_min_mask);
		}
	}

	/*
	 * If phase 1 (skip low memory domains) is over, start phase 2
	 * (consider low memory domains).
	 */
	if (di->di_minskip) {
		di->di_minskip = false;
		/* Browse domains that were under 'v_free_min'. */
		DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);
		return (vm_domainset_iter_phase_first(di, domain));
	}

	return (false);
}

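/*
 * Taken together, vm_domainset_iter_first() and vm_domainset_iter_next()
 * implement the iteration protocol used by the public wrappers below.  A
 * hypothetical internal caller would look roughly like the following sketch
 * (illustration only):
 *
 *	int domain;
 *
 *	if (vm_domainset_iter_first(di, &domain)) {
 *		do {
 *			// Try to allocate from 'domain'; stop on success.
 *		} while (vm_domainset_iter_next(di, &domain));
 *	}
 *
 * Phase 1 offers every domain above the 'v_free_min' threshold; phase 2 then
 * offers the low-memory domains set aside in 'di_min_mask', so no domain is
 * ever seen twice in one pass.
 */
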
int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
    vm_pindex_t pindex, int *domain, int *req)
{
	struct domainset_ref *dr;

	di->di_flags = *req;
	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
	    VM_ALLOC_NOWAIT;

	/*
	 * Object policy takes precedence over thread policy.  The policies
	 * are immutable and unsynchronized.  Updates can race but pointer
	 * loads are assumed to be atomic.
	 */
	if (obj != NULL && obj->domain.dr_policy != NULL) {
		/*
		 * This write lock protects non-atomic increments of the
		 * iterator index in vm_domainset_iter_rr().
		 */
		VM_OBJECT_ASSERT_WLOCKED(obj);
		dr = &obj->domain;
	} else
		dr = &curthread->td_domain;

	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex);
	/*
	 * XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was
	 * passed?
	 */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
    int *domain, struct pctrie_iter *pages)
{
	if (vm_domainset_iter_next(di, domain))
		return (0);

	/*
	 * If we visited all domains and this was a NOWAIT request, return
	 * an error.
	 */
	if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
		return (ENOMEM);

	/* Wait for one of the domains to accumulate some free pages. */
	if (obj != NULL) {
		VM_OBJECT_WUNLOCK(obj);
		if (pages != NULL)
			pctrie_iter_reset(pages);
	}
	vm_wait_doms(&di->di_valid_mask, 0);
	if (obj != NULL)
		VM_OBJECT_WLOCK(obj);
	if ((di->di_flags & VM_ALLOC_WAITFAIL) != 0)
		return (ENOMEM);

	/* Restart the search. */
	/* XXXOC: Shouldn't we just panic on 'false'? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

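/*
 * Consumers of the page iterator typically follow the NOWAIT-first retry
 * pattern sketched below.  'alloc_page_from_domain' is a hypothetical
 * stand-in for a per-domain allocation routine; this is an illustration of
 * the intended calling convention, not a verbatim caller:
 *
 *	struct vm_domainset_iter di;
 *	vm_page_t m;
 *	int domain, req;
 *
 *	req = VM_ALLOC_WAITOK;	// downgraded to VM_ALLOC_NOWAIT internally
 *	if (vm_domainset_iter_page_init(&di, obj, pindex, &domain, &req) != 0)
 *		return (NULL);
 *	do {
 *		m = alloc_page_from_domain(obj, pindex, domain, req);
 *		if (m != NULL)
 *			return (m);
 *	} while (vm_domainset_iter_page(&di, obj, &domain, NULL) == 0);
 *	return (NULL);
 */
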
static int
_vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain,
    int *flags)
{
	di->di_flags = *flags;
	*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
	/* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
    struct domainset *ds, int *domain, int *flags)
{

	vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0);
	return (_vm_domainset_iter_policy_init(di, domain, flags));
}

int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
    struct domainset_ref *dr, int *domain, int *flags)
{

	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0);
	return (_vm_domainset_iter_policy_init(di, domain, flags));
}

int
vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
{
	if (vm_domainset_iter_next(di, domain))
		return (0);

	/*
	 * If we visited all domains and this was a NOWAIT request, return
	 * an error.
	 */
	if ((di->di_flags & M_WAITOK) == 0)
		return (ENOMEM);

	/* Wait for one of the domains to accumulate some free pages. */
	vm_wait_doms(&di->di_valid_mask, 0);

	/* Restart the search. */
	/* XXXOC: Shouldn't we just panic on 'false'? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

void
vm_domainset_iter_ignore(struct vm_domainset_iter *di, int domain)
{
	KASSERT(DOMAINSET_ISSET(domain, &di->di_valid_mask),
	    ("%s: domain %d not present in di_valid_mask for di %p",
	    __func__, domain, di));
	DOMAINSET_CLR(domain, &di->di_valid_mask);
}

#else /* !NUMA */

int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
    int *domain, struct pctrie_iter *pages)
{

	return (EJUSTRETURN);
}

int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
    vm_pindex_t pindex, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}

int
vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
{

	return (EJUSTRETURN);
}

int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
    struct domainset *ds, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}

int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
    struct domainset_ref *dr, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}

void
vm_domainset_iter_ignore(struct vm_domainset_iter *di __unused,
    int domain __unused)
{
}

#endif /* NUMA */