1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright 2024 Oxide Computer Company 28 */ 29 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/errno.h> 34 #include <sys/kmem.h> 35 #include <sys/vnode.h> 36 #include <sys/vfs_opreg.h> 37 #include <sys/swap.h> 38 #include <sys/sysmacros.h> 39 #include <sys/buf.h> 40 #include <sys/callb.h> 41 #include <sys/debug.h> 42 #include <vm/seg.h> 43 #include <sys/fs/swapnode.h> 44 #include <fs/fs_subr.h> 45 #include <sys/cmn_err.h> 46 #include <sys/mem_config.h> 47 #include <sys/atomic.h> 48 49 extern const fs_operation_def_t swap_vnodeops_template[]; 50 51 /* 52 * swapfs_minfree is the amount of physical memory (actually remaining 53 * availrmem) that we want to keep free for the rest of the system. This 54 * means that swapfs can only grow to availrmem - swapfs_minfree. This 55 * can be set as just constant value or a certain percentage of installed 56 * physical memory. It is set in swapinit(). 57 * 58 * Users who want to change the amount of memory that can be used as swap 59 * space should do so by setting swapfs_desfree at boot time, 60 * not swapfs_minfree. 61 */ 62 63 pgcnt_t swapfs_desfree = 0; 64 pgcnt_t swapfs_minfree = 0; 65 pgcnt_t swapfs_reserve = 0; 66 67 #ifdef SWAPFS_DEBUG 68 int swapfs_debug; 69 #endif /* SWAPFS_DEBUG */ 70 71 72 static int swapfs_vpcount; 73 static kmutex_t swapfs_lock; 74 static struct async_reqs *sw_ar, *sw_pendlist, *sw_freelist; 75 76 static struct vnode **swap_vnodes; /* ptr's to swap vnodes */ 77 78 static void swap_init_mem_config(void); 79 80 static pgcnt_t initial_swapfs_desfree; 81 static pgcnt_t initial_swapfs_minfree; 82 static pgcnt_t initial_swapfs_reserve; 83 84 static int swap_sync(struct vfs *vfsp, short flag, struct cred *cr); 85 86 static void 87 swapfs_recalc_save_initial(void) 88 { 89 initial_swapfs_desfree = swapfs_desfree; 90 initial_swapfs_minfree = swapfs_minfree; 91 initial_swapfs_reserve = swapfs_reserve; 92 } 93 94 static int 95 swapfs_recalc(pgcnt_t pgs) 96 { 97 pgcnt_t new_swapfs_desfree; 98 pgcnt_t new_swapfs_minfree; 99 pgcnt_t new_swapfs_reserve; 100 101 new_swapfs_desfree = initial_swapfs_desfree; 102 new_swapfs_minfree = initial_swapfs_minfree; 103 new_swapfs_reserve = initial_swapfs_reserve; 104 105 if (new_swapfs_desfree == 0) 106 new_swapfs_desfree = btopr(7 * 512 * 1024); /* 3-1/2Mb */; 107 108 if (new_swapfs_minfree == 0) { 109 /* 110 * Set swapfs_minfree to be an eighth of physical, but 111 * capped at 512 MiB. 112 */ 113 new_swapfs_minfree = MIN(btopr(512 * 1024 * 1024), pgs >> 3); 114 } 115 116 /* 117 * priv processes can reserve memory as swap as long as availrmem 118 * remains greater than swapfs_minfree; in the case of non-priv 119 * processes, memory can be reserved as swap only if availrmem 120 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 121 * swapfs_reserve amount of memswap is not available to non-priv 122 * processes. This protects daemons such as automounter dying 123 * as a result of application processes eating away almost entire 124 * membased swap. This safeguard becomes useless if apps are run 125 * with root access. 126 * 127 * set swapfs_reserve to a minimum of 4Mb or 1/128 of physmem whichever 128 * is greater up to the limit of 128 MB. 129 */ 130 if (new_swapfs_reserve == 0) 131 new_swapfs_reserve = MIN(btopr(128 * 1024 * 1024), 132 MAX(btopr(4 * 1024 * 1024), pgs >> 7)); 133 134 /* Test basic numeric viability. */ 135 if (new_swapfs_minfree > pgs) 136 return (0); 137 138 /* Equivalent test to anon_resvmem() check. */ 139 if (availrmem < new_swapfs_minfree) { 140 /* 141 * If ism pages are being used, then there must be agreement 142 * between these two policies. 143 */ 144 if ((availrmem > segspt_minfree) && (segspt_minfree > 0)) { 145 new_swapfs_minfree = segspt_minfree; 146 } else { 147 return (0); 148 } 149 } 150 151 swapfs_desfree = new_swapfs_desfree; 152 swapfs_minfree = new_swapfs_minfree; 153 swapfs_reserve = new_swapfs_reserve; 154 155 return (1); 156 } 157 158 /*ARGSUSED1*/ 159 int 160 swapinit(int fstype, char *name) 161 { 162 /* reserve for mp */ 163 ssize_t sw_freelist_size = klustsize / PAGESIZE * 2; 164 int i, error; 165 166 static const fs_operation_def_t swap_vfsops[] = { 167 VFSNAME_SYNC, { .vfs_sync = swap_sync }, 168 NULL, NULL 169 }; 170 171 SWAPFS_PRINT(SWAP_SUBR, "swapinit\n", 0, 0, 0, 0, 0); 172 mutex_init(&swapfs_lock, NULL, MUTEX_DEFAULT, NULL); 173 174 swap_vnodes = kmem_zalloc(MAX_SWAP_VNODES * sizeof (struct vnode *), 175 KM_SLEEP); 176 177 swapfs_recalc_save_initial(); 178 if (!swapfs_recalc(physmem)) 179 cmn_err(CE_PANIC, "swapfs_minfree(%lu) > physmem(%lu)", 180 swapfs_minfree, physmem); 181 182 /* 183 * Arrange for a callback on memory size change. 184 */ 185 swap_init_mem_config(); 186 187 sw_ar = (struct async_reqs *) 188 kmem_zalloc(sw_freelist_size*sizeof (struct async_reqs), KM_SLEEP); 189 190 error = vfs_setfsops(fstype, swap_vfsops, NULL); 191 if (error != 0) { 192 cmn_err(CE_WARN, "swapinit: bad vfs ops template"); 193 return (error); 194 } 195 196 error = vn_make_ops(name, swap_vnodeops_template, &swap_vnodeops); 197 if (error != 0) { 198 (void) vfs_freevfsops_by_type(fstype); 199 cmn_err(CE_WARN, "swapinit: bad vnode ops template"); 200 return (error); 201 } 202 sw_freelist = sw_ar; 203 for (i = 0; i < sw_freelist_size - 1; i++) 204 sw_ar[i].a_next = &sw_ar[i + 1]; 205 206 return (0); 207 } 208 209 /* 210 * Get a swapfs vnode corresponding to the specified identifier. 211 */ 212 struct vnode * 213 swapfs_getvp(ulong_t vidx) 214 { 215 struct vnode *vp; 216 217 vp = swap_vnodes[vidx]; 218 if (vp) { 219 return (vp); 220 } 221 222 mutex_enter(&swapfs_lock); 223 vp = swap_vnodes[vidx]; 224 if (vp == NULL) { 225 vp = vn_alloc(KM_SLEEP); 226 vn_setops(vp, swap_vnodeops); 227 vp->v_type = VREG; 228 vp->v_flag |= (VISSWAP|VISSWAPFS); 229 swap_vnodes[vidx] = vp; 230 swapfs_vpcount++; 231 } 232 mutex_exit(&swapfs_lock); 233 return (vp); 234 } 235 236 int swap_lo; 237 238 /*ARGSUSED*/ 239 static int 240 swap_sync(struct vfs *vfsp, short flag, struct cred *cr) 241 { 242 struct vnode *vp; 243 int i; 244 245 if (!(flag & SYNC_ALL)) 246 return (1); 247 248 /* 249 * assumes that we are the only one left to access this so that 250 * no need to use swapfs_lock (since it's staticly defined) 251 */ 252 for (i = 0; i < MAX_SWAP_VNODES; i++) { 253 vp = swap_vnodes[i]; 254 if (vp) { 255 VN_HOLD(vp); 256 (void) VOP_PUTPAGE(vp, (offset_t)0, 0, 257 (B_ASYNC | B_FREE), kcred, NULL); 258 VN_RELE(vp); 259 } 260 } 261 return (0); 262 } 263 264 extern int sw_pending_size; 265 266 /* 267 * Take an async request off the pending queue 268 */ 269 struct async_reqs * 270 sw_getreq() 271 { 272 struct async_reqs *arg; 273 274 mutex_enter(&swapfs_lock); 275 arg = sw_pendlist; 276 if (arg) { 277 sw_pendlist = arg->a_next; 278 arg->a_next = NULL; 279 sw_pending_size -= PAGESIZE; 280 } 281 ASSERT(sw_pending_size >= 0); 282 mutex_exit(&swapfs_lock); 283 return (arg); 284 } 285 286 /* 287 * Put an async request on the pending queue 288 */ 289 void 290 sw_putreq(struct async_reqs *arg) 291 { 292 /* Hold onto it */ 293 VN_HOLD(arg->a_vp); 294 295 mutex_enter(&swapfs_lock); 296 arg->a_next = sw_pendlist; 297 sw_pendlist = arg; 298 sw_pending_size += PAGESIZE; 299 mutex_exit(&swapfs_lock); 300 } 301 302 /* 303 * Put an async request back on the pending queue 304 */ 305 void 306 sw_putbackreq(struct async_reqs *arg) 307 { 308 mutex_enter(&swapfs_lock); 309 arg->a_next = sw_pendlist; 310 sw_pendlist = arg; 311 sw_pending_size += PAGESIZE; 312 mutex_exit(&swapfs_lock); 313 } 314 315 /* 316 * Take an async request structure off the free list 317 */ 318 struct async_reqs * 319 sw_getfree() 320 { 321 struct async_reqs *arg; 322 323 mutex_enter(&swapfs_lock); 324 arg = sw_freelist; 325 if (arg) { 326 sw_freelist = arg->a_next; 327 arg->a_next = NULL; 328 } 329 mutex_exit(&swapfs_lock); 330 return (arg); 331 } 332 333 /* 334 * Put an async request structure on the free list 335 */ 336 void 337 sw_putfree(struct async_reqs *arg) 338 { 339 /* Release our hold - should have locked the page by now */ 340 VN_RELE(arg->a_vp); 341 342 mutex_enter(&swapfs_lock); 343 arg->a_next = sw_freelist; 344 sw_freelist = arg; 345 mutex_exit(&swapfs_lock); 346 } 347 348 static pgcnt_t swapfs_pending_delete; 349 350 /*ARGSUSED*/ 351 static void 352 swap_mem_config_post_add( 353 void *arg, 354 pgcnt_t delta_swaps) 355 { 356 (void) swapfs_recalc(physmem - swapfs_pending_delete); 357 } 358 359 /*ARGSUSED*/ 360 static int 361 swap_mem_config_pre_del( 362 void *arg, 363 pgcnt_t delta_swaps) 364 { 365 pgcnt_t nv; 366 367 nv = atomic_add_long_nv(&swapfs_pending_delete, (spgcnt_t)delta_swaps); 368 if (!swapfs_recalc(physmem - nv)) { 369 /* 370 * Tidy-up is done by the call to post_del which 371 * is always made. 372 */ 373 cmn_err(CE_NOTE, "Memory operation refused to ensure system " 374 "doesn't deadlock due to excessive consumption by swapfs."); 375 return (EBUSY); 376 } 377 return (0); 378 } 379 380 /*ARGSUSED*/ 381 static void 382 swap_mem_config_post_del( 383 void *arg, 384 pgcnt_t delta_swaps, 385 int cancelled) 386 { 387 pgcnt_t nv; 388 389 nv = atomic_add_long_nv(&swapfs_pending_delete, -(spgcnt_t)delta_swaps); 390 (void) swapfs_recalc(physmem - nv); 391 } 392 393 static kphysm_setup_vector_t swap_mem_config_vec = { 394 KPHYSM_SETUP_VECTOR_VERSION, 395 swap_mem_config_post_add, 396 swap_mem_config_pre_del, 397 swap_mem_config_post_del, 398 }; 399 400 static void 401 swap_init_mem_config(void) 402 { 403 int ret; 404 405 ret = kphysm_setup_func_register(&swap_mem_config_vec, (void *)NULL); 406 ASSERT(ret == 0); 407 } 408