1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright 2024 Oxide Computer Company
28 */
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/errno.h>
34 #include <sys/kmem.h>
35 #include <sys/vnode.h>
36 #include <sys/vfs_opreg.h>
37 #include <sys/swap.h>
38 #include <sys/sysmacros.h>
39 #include <sys/buf.h>
40 #include <sys/callb.h>
41 #include <sys/debug.h>
42 #include <vm/seg.h>
43 #include <sys/fs/swapnode.h>
44 #include <fs/fs_subr.h>
45 #include <sys/cmn_err.h>
46 #include <sys/mem_config.h>
47 #include <sys/atomic.h>
48
49 extern const fs_operation_def_t swap_vnodeops_template[];
50
51 /*
52 * swapfs_minfree is the amount of physical memory (actually remaining
53 * availrmem) that we want to keep free for the rest of the system. This
54 * means that swapfs can only grow to availrmem - swapfs_minfree. This
55 * can be set as just constant value or a certain percentage of installed
56 * physical memory. It is set in swapinit().
57 *
58 * Users who want to change the amount of memory that can be used as swap
59 * space should do so by setting swapfs_desfree at boot time,
60 * not swapfs_minfree.
61 */
62
/* Tunables; a value of zero means "derive a default in swapfs_recalc()". */
pgcnt_t swapfs_desfree = 0;
pgcnt_t swapfs_minfree = 0;
pgcnt_t swapfs_reserve = 0;

#ifdef SWAPFS_DEBUG
int swapfs_debug;
#endif /* SWAPFS_DEBUG */


static int swapfs_vpcount;	/* number of vnodes installed in swap_vnodes */
static kmutex_t swapfs_lock;	/* protects the request lists and vnode table */
static struct async_reqs *sw_ar, *sw_pendlist, *sw_freelist;

static struct vnode **swap_vnodes;  /* ptr's to swap vnodes */

static void swap_init_mem_config(void);

/* Boot-time tunable values saved by swapfs_recalc_save_initial(). */
static pgcnt_t initial_swapfs_desfree;
static pgcnt_t initial_swapfs_minfree;
static pgcnt_t initial_swapfs_reserve;

static int swap_sync(struct vfs *vfsp, short flag, struct cred *cr);
85
86 static void
swapfs_recalc_save_initial(void)87 swapfs_recalc_save_initial(void)
88 {
89 initial_swapfs_desfree = swapfs_desfree;
90 initial_swapfs_minfree = swapfs_minfree;
91 initial_swapfs_reserve = swapfs_reserve;
92 }
93
94 static int
swapfs_recalc(pgcnt_t pgs)95 swapfs_recalc(pgcnt_t pgs)
96 {
97 pgcnt_t new_swapfs_desfree;
98 pgcnt_t new_swapfs_minfree;
99 pgcnt_t new_swapfs_reserve;
100
101 new_swapfs_desfree = initial_swapfs_desfree;
102 new_swapfs_minfree = initial_swapfs_minfree;
103 new_swapfs_reserve = initial_swapfs_reserve;
104
105 if (new_swapfs_desfree == 0)
106 new_swapfs_desfree = btopr(7 * 512 * 1024); /* 3-1/2Mb */;
107
108 if (new_swapfs_minfree == 0) {
109 /*
110 * Set swapfs_minfree to be an eighth of physical, but
111 * capped at 512 MiB.
112 */
113 new_swapfs_minfree = MIN(btopr(512 * 1024 * 1024), pgs >> 3);
114 }
115
116 /*
117 * priv processes can reserve memory as swap as long as availrmem
118 * remains greater than swapfs_minfree; in the case of non-priv
119 * processes, memory can be reserved as swap only if availrmem
120 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
121 * swapfs_reserve amount of memswap is not available to non-priv
122 * processes. This protects daemons such as automounter dying
123 * as a result of application processes eating away almost entire
124 * membased swap. This safeguard becomes useless if apps are run
125 * with root access.
126 *
127 * set swapfs_reserve to a minimum of 4Mb or 1/128 of physmem whichever
128 * is greater up to the limit of 128 MB.
129 */
130 if (new_swapfs_reserve == 0)
131 new_swapfs_reserve = MIN(btopr(128 * 1024 * 1024),
132 MAX(btopr(4 * 1024 * 1024), pgs >> 7));
133
134 /* Test basic numeric viability. */
135 if (new_swapfs_minfree > pgs)
136 return (0);
137
138 /* Equivalent test to anon_resvmem() check. */
139 if (availrmem < new_swapfs_minfree) {
140 /*
141 * If ism pages are being used, then there must be agreement
142 * between these two policies.
143 */
144 if ((availrmem > segspt_minfree) && (segspt_minfree > 0)) {
145 new_swapfs_minfree = segspt_minfree;
146 } else {
147 return (0);
148 }
149 }
150
151 swapfs_desfree = new_swapfs_desfree;
152 swapfs_minfree = new_swapfs_minfree;
153 swapfs_reserve = new_swapfs_reserve;
154
155 return (1);
156 }
157
/*
 * One-time initialization of the swapfs file system type.  Computes the
 * swapfs memory tunables (panicking if they are unsatisfiable), allocates
 * the swap vnode table and the async-request pool, registers for memory
 * size-change callbacks, and installs the vfs/vnode operation templates.
 * Returns 0 on success or the error from template installation.
 */
/*ARGSUSED1*/
int
swapinit(int fstype, char *name)
{
	/* reserve for mp */
	ssize_t sw_freelist_size = klustsize / PAGESIZE * 2;
	int i, error;

	static const fs_operation_def_t swap_vfsops[] = {
		VFSNAME_SYNC, { .vfs_sync = swap_sync },
		NULL, NULL
	};

	SWAPFS_PRINT(SWAP_SUBR, "swapinit\n", 0, 0, 0, 0, 0);
	mutex_init(&swapfs_lock, NULL, MUTEX_DEFAULT, NULL);

	swap_vnodes = kmem_zalloc(MAX_SWAP_VNODES * sizeof (struct vnode *),
	    KM_SLEEP);

	swapfs_recalc_save_initial();
	/* A zero return means minfree cannot be satisfied; cannot continue. */
	if (!swapfs_recalc(physmem))
		cmn_err(CE_PANIC, "swapfs_minfree(%lu) > physmem(%lu)",
		    swapfs_minfree, physmem);

	/*
	 * Arrange for a callback on memory size change.
	 */
	swap_init_mem_config();

	sw_ar = (struct async_reqs *)
	    kmem_zalloc(sw_freelist_size*sizeof (struct async_reqs), KM_SLEEP);

	error = vfs_setfsops(fstype, swap_vfsops, NULL);
	if (error != 0) {
		cmn_err(CE_WARN, "swapinit: bad vfs ops template");
		return (error);
	}

	error = vn_make_ops(name, swap_vnodeops_template, &swap_vnodeops);
	if (error != 0) {
		/* Unwind the vfs ops registered above. */
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "swapinit: bad vnode ops template");
		return (error);
	}

	/* Chain the preallocated request structures into the free list. */
	sw_freelist = sw_ar;
	for (i = 0; i < sw_freelist_size - 1; i++)
		sw_ar[i].a_next = &sw_ar[i + 1];

	return (0);
}
208
209 /*
210 * Get a swapfs vnode corresponding to the specified identifier.
211 */
struct vnode *
swapfs_getvp(ulong_t vidx)
{
	struct vnode *vp;

	/*
	 * Lock-free fast path: within this file a slot, once populated,
	 * is never cleared, so a non-NULL read can be returned directly.
	 */
	vp = swap_vnodes[vidx];
	if (vp) {
		return (vp);
	}

	/* Slow path: recheck under the lock and allocate if still empty. */
	mutex_enter(&swapfs_lock);
	vp = swap_vnodes[vidx];
	if (vp == NULL) {
		vp = vn_alloc(KM_SLEEP);
		vn_setops(vp, swap_vnodeops);
		vp->v_type = VREG;
		vp->v_flag |= (VISSWAP|VISSWAPFS);
		swap_vnodes[vidx] = vp;
		swapfs_vpcount++;
	}
	mutex_exit(&swapfs_lock);
	return (vp);
}
235
/* NOTE(review): not referenced in this file; presumably used elsewhere. */
int swap_lo;
237
238 /*ARGSUSED*/
239 static int
swap_sync(struct vfs * vfsp,short flag,struct cred * cr)240 swap_sync(struct vfs *vfsp, short flag, struct cred *cr)
241 {
242 struct vnode *vp;
243 int i;
244
245 if (!(flag & SYNC_ALL))
246 return (1);
247
248 /*
249 * assumes that we are the only one left to access this so that
250 * no need to use swapfs_lock (since it's staticly defined)
251 */
252 for (i = 0; i < MAX_SWAP_VNODES; i++) {
253 vp = swap_vnodes[i];
254 if (vp) {
255 VN_HOLD(vp);
256 (void) VOP_PUTPAGE(vp, (offset_t)0, 0,
257 (B_ASYNC | B_FREE), kcred, NULL);
258 VN_RELE(vp);
259 }
260 }
261 return (0);
262 }
263
264 extern int sw_pending_size;
265
266 /*
267 * Take an async request off the pending queue
268 */
269 struct async_reqs *
sw_getreq()270 sw_getreq()
271 {
272 struct async_reqs *arg;
273
274 mutex_enter(&swapfs_lock);
275 arg = sw_pendlist;
276 if (arg) {
277 sw_pendlist = arg->a_next;
278 arg->a_next = NULL;
279 sw_pending_size -= PAGESIZE;
280 }
281 ASSERT(sw_pending_size >= 0);
282 mutex_exit(&swapfs_lock);
283 return (arg);
284 }
285
286 /*
287 * Put an async request on the pending queue
288 */
289 void
sw_putreq(struct async_reqs * arg)290 sw_putreq(struct async_reqs *arg)
291 {
292 /* Hold onto it */
293 VN_HOLD(arg->a_vp);
294
295 mutex_enter(&swapfs_lock);
296 arg->a_next = sw_pendlist;
297 sw_pendlist = arg;
298 sw_pending_size += PAGESIZE;
299 mutex_exit(&swapfs_lock);
300 }
301
302 /*
303 * Put an async request back on the pending queue
304 */
305 void
sw_putbackreq(struct async_reqs * arg)306 sw_putbackreq(struct async_reqs *arg)
307 {
308 mutex_enter(&swapfs_lock);
309 arg->a_next = sw_pendlist;
310 sw_pendlist = arg;
311 sw_pending_size += PAGESIZE;
312 mutex_exit(&swapfs_lock);
313 }
314
315 /*
316 * Take an async request structure off the free list
317 */
318 struct async_reqs *
sw_getfree()319 sw_getfree()
320 {
321 struct async_reqs *arg;
322
323 mutex_enter(&swapfs_lock);
324 arg = sw_freelist;
325 if (arg) {
326 sw_freelist = arg->a_next;
327 arg->a_next = NULL;
328 }
329 mutex_exit(&swapfs_lock);
330 return (arg);
331 }
332
333 /*
334 * Put an async request structure on the free list
335 */
336 void
sw_putfree(struct async_reqs * arg)337 sw_putfree(struct async_reqs *arg)
338 {
339 /* Release our hold - should have locked the page by now */
340 VN_RELE(arg->a_vp);
341
342 mutex_enter(&swapfs_lock);
343 arg->a_next = sw_freelist;
344 sw_freelist = arg;
345 mutex_exit(&swapfs_lock);
346 }
347
/* Pages staged for removal by an in-progress memory delete (see pre_del). */
static pgcnt_t swapfs_pending_delete;
349
350 /*ARGSUSED*/
351 static void
swap_mem_config_post_add(void * arg,pgcnt_t delta_swaps)352 swap_mem_config_post_add(
353 void *arg,
354 pgcnt_t delta_swaps)
355 {
356 (void) swapfs_recalc(physmem - swapfs_pending_delete);
357 }
358
359 /*ARGSUSED*/
360 static int
swap_mem_config_pre_del(void * arg,pgcnt_t delta_swaps)361 swap_mem_config_pre_del(
362 void *arg,
363 pgcnt_t delta_swaps)
364 {
365 pgcnt_t nv;
366
367 nv = atomic_add_long_nv(&swapfs_pending_delete, (spgcnt_t)delta_swaps);
368 if (!swapfs_recalc(physmem - nv)) {
369 /*
370 * Tidy-up is done by the call to post_del which
371 * is always made.
372 */
373 cmn_err(CE_NOTE, "Memory operation refused to ensure system "
374 "doesn't deadlock due to excessive consumption by swapfs.");
375 return (EBUSY);
376 }
377 return (0);
378 }
379
380 /*ARGSUSED*/
381 static void
swap_mem_config_post_del(void * arg,pgcnt_t delta_swaps,int cancelled)382 swap_mem_config_post_del(
383 void *arg,
384 pgcnt_t delta_swaps,
385 int cancelled)
386 {
387 pgcnt_t nv;
388
389 nv = atomic_add_long_nv(&swapfs_pending_delete, -(spgcnt_t)delta_swaps);
390 (void) swapfs_recalc(physmem - nv);
391 }
392
/*
 * Callback vector registered with the memory size-change framework.
 * Entry order must match kphysm_setup_vector_t (version, post-add,
 * pre-delete, post-delete).
 */
static kphysm_setup_vector_t swap_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	swap_mem_config_post_add,	/* post_add */
	swap_mem_config_pre_del,	/* pre_del */
	swap_mem_config_post_del,	/* post_del */
};
399
400 static void
swap_init_mem_config(void)401 swap_init_mem_config(void)
402 {
403 int ret;
404
405 ret = kphysm_setup_func_register(&swap_mem_config_vec, (void *)NULL);
406 ASSERT(ret == 0);
407 }
408