xref: /titanic_51/usr/src/uts/common/vm/vm_swap.c (revision 381a2a9a387f449fab7d0c7e97c4184c26963abf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 /*
43  * Each physical swap area has an associated bitmap representing
44  * its physical storage. The bitmap records which swap slots are
45  * currently allocated or freed.  Allocation is done by searching
46  * through the bitmap for the first free slot. Thus, there's
47  * no linear relation between offset within the swap device and the
48  * address (within its segment(s)) of the page that the slot backs;
49  * instead, it's an arbitrary one-to-one mapping.
50  *
51  * Associated with each swap area is a swapinfo structure.  These
52  * structures are linked into a linear list that determines the
53  * ordering of swap areas in the logical swap device.  Each contains a
54  * pointer to the corresponding bitmap, the area's size, and its
55  * associated vnode.
56  */
57 
58 #include <sys/types.h>
59 #include <sys/inttypes.h>
60 #include <sys/param.h>
61 #include <sys/t_lock.h>
62 #include <sys/sysmacros.h>
63 #include <sys/systm.h>
64 #include <sys/errno.h>
65 #include <sys/kmem.h>
66 #include <sys/vfs.h>
67 #include <sys/vnode.h>
68 #include <sys/pathname.h>
69 #include <sys/cmn_err.h>
70 #include <sys/vtrace.h>
71 #include <sys/swap.h>
72 #include <sys/dumphdr.h>
73 #include <sys/debug.h>
74 #include <sys/fs/snode.h>
75 #include <sys/fs/swapnode.h>
76 #include <sys/policy.h>
77 #include <sys/zone.h>
78 
79 #include <vm/as.h>
80 #include <vm/seg.h>
81 #include <vm/page.h>
82 #include <vm/seg_vn.h>
83 #include <vm/hat.h>
84 #include <vm/anon.h>
85 #include <vm/seg_map.h>
86 
87 /*
88  * To balance the load among multiple swap areas, we don't allow
89  * more than swap_maxcontig allocations to be satisfied from a
90  * single swap area before moving on to the next swap area.  This
91  * effectively "interleaves" allocations among the many swap areas.
92  */
93 int swap_maxcontig;	/* set by anon_init() to 1 Mb */
94 
95 #define	MINIROOTSIZE	12000	/* ~6 Meg XXX */
96 
97 /*
98  * XXX - this lock is a kludge. It serializes some aspects of swapadd() and
99  * swapdel() (namely VOP_OPEN, VOP_CLOSE, VN_RELE).  It protects against
100  * somebody swapadd'ing and getting swap slots from a vnode, while someone
101  * else is in the process of closing or rele'ing it.
102  */
103 static kmutex_t swap_lock;
104 
105 kmutex_t swapinfo_lock;
106 
107 /*
108  * protected by the swapinfo_lock
109  */
110 struct swapinfo	*swapinfo;
111 
112 static	struct	swapinfo *silast;
113 static	int	nswapfiles;
114 
115 static u_offset_t	swap_getoff(struct swapinfo *);
116 static int	swapadd(struct vnode *, ulong_t, ulong_t, char *);
117 static int	swapdel(struct vnode *, ulong_t);
118 static int	swapslot_free(struct vnode *, u_offset_t, struct swapinfo *);
119 
/*
 * Swap device bitmap allocation macros.
 *
 * A swap area's slot bitmap is an array of 32-bit words: slot i lives in
 * word (i >> MAPSHIFT) at bit position (i % NBBW).  A set bit marks the
 * slot as allocated, a clear bit marks it as free.
 *
 * The single-bit mask is built from an unsigned constant so that selecting
 * bit 31 never shifts a 1 into the sign bit of an int (undefined behavior).
 */
#define	MAPSHIFT	5
#define	NBBW		(NBPW * NBBY)	/* number of bits per word */
#define	TESTBIT(map, i)		(((map)[(i) >> MAPSHIFT] & (1U << ((i) % NBBW))))
#define	SETBIT(map, i)		(((map)[(i) >> MAPSHIFT] |= (1U << ((i) % NBBW))))
#define	CLEARBIT(map, i)	(((map)[(i) >> MAPSHIFT] &= ~(1U << ((i) % NBBW))))
128 
129 int swap_debug = 0;	/* set for debug printf's */
130 int swap_verify = 0;	/* set to verify slots when freeing and allocating */
131 
132 uint_t swapalloc_maxcontig;
133 
134 /*
135  * Allocate a range of up to *lenp contiguous slots (page) from a physical
136  * swap device. Flags are one of:
137  *	SA_NOT  Must have a slot from a physical swap device other than the
138  * 		the one containing input (*vpp, *offp).
139  * Less slots than requested may be returned. *lenp allocated slots are
140  * returned starting at *offp on *vpp.
141  * Returns 1 for a successful allocation, 0 for couldn't allocate any slots.
142  */
143 int
144 swap_phys_alloc(
145 	struct vnode **vpp,
146 	u_offset_t *offp,
147 	size_t *lenp,
148 	uint_t flags)
149 {
150 	struct swapinfo *sip;
151 	offset_t soff, noff;
152 	size_t len;
153 
154 	mutex_enter(&swapinfo_lock);
155 	sip = silast;
156 
157 	/* Find a desirable physical device and allocate from it. */
158 	do {
159 		if (sip == NULL)
160 			break;
161 		if (!(sip->si_flags & ST_INDEL) &&
162 		    (spgcnt_t)sip->si_nfpgs > 0) {
163 			/* Caller wants other than specified swap device */
164 			if (flags & SA_NOT) {
165 				if (*vpp != sip->si_vp ||
166 				    *offp < sip->si_soff ||
167 				    *offp >= sip->si_eoff)
168 					goto found;
169 			/* Caller is loose, will take anything */
170 			} else
171 				goto found;
172 		} else if (sip->si_nfpgs == 0)
173 			sip->si_allocs = 0;
174 		if ((sip = sip->si_next) == NULL)
175 			sip = swapinfo;
176 	} while (sip != silast);
177 	mutex_exit(&swapinfo_lock);
178 	return (0);
179 found:
180 	soff = swap_getoff(sip);
181 	sip->si_nfpgs--;
182 	if (soff == -1)
183 		panic("swap_alloc: swap_getoff failed!");
184 
185 	for (len = PAGESIZE; len < *lenp; len += PAGESIZE) {
186 		if (sip->si_nfpgs == 0)
187 			break;
188 		if (swapalloc_maxcontig && len >= swapalloc_maxcontig)
189 			break;
190 		noff = swap_getoff(sip);
191 		if (noff == -1) {
192 			break;
193 		} else if (noff != soff + len) {
194 			CLEARBIT(sip->si_swapslots, btop(noff - sip->si_soff));
195 			break;
196 		}
197 		sip->si_nfpgs--;
198 	}
199 	*vpp = sip->si_vp;
200 	*offp = soff;
201 	*lenp = len;
202 	ASSERT((spgcnt_t)sip->si_nfpgs >= 0);
203 	sip->si_allocs += btop(len);
204 	if (sip->si_allocs >= swap_maxcontig) {
205 		sip->si_allocs = 0;
206 		if ((silast = sip->si_next) == NULL)
207 			silast = swapinfo;
208 	}
209 	TRACE_2(TR_FAC_VM, TR_SWAP_ALLOC,
210 		"swap_alloc:sip %p offset %lx", sip, soff);
211 	mutex_exit(&swapinfo_lock);
212 	return (1);
213 }
214 
215 int swap_backsearch = 0;
216 
217 /*
218  * Get a free offset on swap device sip.
219  * Return >=0 offset if succeeded, -1 for failure.
220  */
221 static u_offset_t
222 swap_getoff(struct swapinfo *sip)
223 {
224 	uint_t *sp, *ep;
225 	size_t aoff, boff, poff, slotnumber;
226 
227 	ASSERT(MUTEX_HELD(&swapinfo_lock));
228 
229 	sip->si_alloccnt++;
230 	for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
231 	    ep = &sip->si_swapslots[sip->si_mapsize / NBPW]; sp < ep; sp++) {
232 		if (*sp != (uint_t)0xffffffff)
233 			goto foundentry;
234 		else
235 			sip->si_checkcnt++;
236 	}
237 	SWAP_PRINT(SW_ALLOC,
238 	    "swap_getoff: couldn't find slot from hint %ld to end\n",
239 	    sip->si_hint, 0, 0, 0, 0);
240 	/*
241 	 * Go backwards? Check for faster method XXX
242 	 */
243 	if (swap_backsearch) {
244 		for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
245 		    ep = sip->si_swapslots; sp > ep; sp--) {
246 			if (*sp != (uint_t)0xffffffff)
247 				goto foundentry;
248 			else
249 				sip->si_checkcnt++;
250 		}
251 	} else {
252 		for (sp = sip->si_swapslots,
253 		    ep = &sip->si_swapslots[sip->si_hint >> MAPSHIFT];
254 		    sp < ep; sp++) {
255 			if (*sp != (uint_t)0xffffffff)
256 				goto foundentry;
257 			else
258 				sip->si_checkcnt++;
259 		}
260 	}
261 	if (*sp == 0xffffffff) {
262 		cmn_err(CE_WARN, "No free swap slots!");
263 		return ((u_offset_t)-1);
264 	}
265 
266 foundentry:
267 	/*
268 	 * aoff is the page number offset (in bytes) of the si_swapslots
269 	 * array element containing a free page
270 	 *
271 	 * boff is the page number offset of the free page
272 	 * (i.e. cleared bit) in si_swapslots[aoff].
273 	 */
274 	aoff = ((char *)sp - (char *)sip->si_swapslots) * NBBY;
275 
276 	for (boff = (sip->si_hint % NBBW); boff < NBBW; boff++) {
277 		if (!TESTBIT(sip->si_swapslots, aoff + boff))
278 			goto foundslot;
279 		else
280 			sip->si_checkcnt++;
281 	}
282 	for (boff = 0; boff < (sip->si_hint % NBBW); boff++) {
283 		if (!TESTBIT(sip->si_swapslots, aoff + boff))
284 			goto foundslot;
285 		else
286 			sip->si_checkcnt++;
287 	}
288 	panic("swap_getoff: didn't find slot in word hint %ld", sip->si_hint);
289 
290 foundslot:
291 	/*
292 	 * Return the offset of the free page in swap device.
293 	 * Convert page number of byte offset and add starting
294 	 * offset of swap device.
295 	 */
296 	slotnumber = aoff + boff;
297 	SWAP_PRINT(SW_ALLOC, "swap_getoff: allocating slot %ld\n",
298 	    slotnumber, 0, 0, 0, 0);
299 	poff = ptob(slotnumber);
300 	if (poff + sip->si_soff >= sip->si_eoff)
301 		printf("ptob(aoff(%ld) + boff(%ld))(%ld) >= eoff(%ld)\n",
302 		    aoff, boff, ptob(slotnumber), (long)sip->si_eoff);
303 	ASSERT(poff < sip->si_eoff);
304 	/*
305 	 * We could verify here that the slot isn't already allocated
306 	 * by looking through all the anon slots.
307 	 */
308 	SETBIT(sip->si_swapslots, slotnumber);
309 	sip->si_hint = slotnumber + 1;	/* hint = next slot */
310 	return (poff + sip->si_soff);
311 }
312 
313 /*
314  * Free a swap page.
315  */
316 void
317 swap_phys_free(struct vnode *vp, u_offset_t off, size_t len)
318 {
319 	struct swapinfo *sip;
320 	ssize_t pagenumber, npage;
321 
322 	mutex_enter(&swapinfo_lock);
323 	sip = swapinfo;
324 
325 	do {
326 		if (sip->si_vp == vp &&
327 		    sip->si_soff <= off && off < sip->si_eoff) {
328 			for (pagenumber = btop(off - sip->si_soff),
329 			    npage = btop(len) + pagenumber;
330 			    pagenumber < npage; pagenumber++) {
331 				SWAP_PRINT(SW_ALLOC,
332 				    "swap_phys_free: freeing slot %ld on "
333 				    "sip %p\n",
334 				    pagenumber, sip, 0, 0, 0);
335 				if (!TESTBIT(sip->si_swapslots, pagenumber)) {
336 					panic(
337 					    "swap_phys_free: freeing free slot "
338 					    "%p,%lx\n", (void *)vp,
339 					    ptob(pagenumber) + sip->si_soff);
340 				}
341 				CLEARBIT(sip->si_swapslots, pagenumber);
342 				sip->si_nfpgs++;
343 			}
344 			ASSERT(sip->si_nfpgs <= sip->si_npgs);
345 			mutex_exit(&swapinfo_lock);
346 			return;
347 		}
348 	} while ((sip = sip->si_next) != NULL);
349 	panic("swap_phys_free");
350 	/*NOTREACHED*/
351 }
352 
353 /*
354  * Return the anon struct corresponding for the given
355  * <vnode, off> if it is part of the virtual swap device.
356  * Return the anon struct if found, otherwise NULL.
357  */
358 struct anon *
359 swap_anon(struct vnode *vp, u_offset_t off)
360 {
361 	struct anon *ap;
362 
363 	ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(vp, off)]));
364 
365 	for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL; ap = ap->an_hash) {
366 		if (ap->an_vp == vp && ap->an_off == off)
367 			return (ap);
368 	}
369 	return (NULL);
370 }
371 
372 
373 /*
374  * Determine if the vp offset range overlap a swap device.
375  */
376 int
377 swap_in_range(struct vnode *vp, u_offset_t offset, size_t len)
378 {
379 	struct swapinfo *sip;
380 	u_offset_t eoff;
381 
382 	eoff = offset + len;
383 	ASSERT(eoff > offset);
384 
385 	mutex_enter(&swapinfo_lock);
386 	sip = swapinfo;
387 	if (vp && sip) {
388 		do {
389 			if (vp != sip->si_vp || eoff <= sip->si_soff ||
390 			    offset >= sip->si_eoff)
391 				continue;
392 			mutex_exit(&swapinfo_lock);
393 			return (1);
394 		} while ((sip = sip->si_next) != NULL);
395 	}
396 	mutex_exit(&swapinfo_lock);
397 	return (0);
398 }
399 
400 /*
401  * See if name is one of our swap files
402  * even though lookupname failed.
403  * This can be used by swapdel to delete
404  * swap resources on remote machines
405  * where the link has gone down.
406  */
407 static struct vnode *
408 swapdel_byname(
409 	char 	*name,			/* pathname to delete */
410 	ulong_t lowblk) 	/* Low block number of area to delete */
411 {
412 	struct swapinfo **sipp, *osip;
413 	u_offset_t soff;
414 
415 	/*
416 	 * Find the swap file entry for the file to
417 	 * be deleted. Skip any entries that are in
418 	 * transition.
419 	 */
420 
421 	soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */
422 
423 	mutex_enter(&swapinfo_lock);
424 	for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
425 		if ((strcmp(osip->si_pname, name) == 0) &&
426 		    (osip->si_soff == soff) && (osip->si_flags == 0)) {
427 			struct vnode *vp = osip->si_vp;
428 
429 			VN_HOLD(vp);
430 			mutex_exit(&swapinfo_lock);
431 			return (vp);
432 		}
433 	}
434 	mutex_exit(&swapinfo_lock);
435 	return (NULL);
436 }
437 
438 
439 /*
440  * New system call to manipulate swap files.
441  */
442 int
443 swapctl(int sc_cmd, void *sc_arg, int *rv)
444 {
445 	struct swapinfo *sip, *csip, *tsip;
446 	int error = 0;
447 	struct swapent st, *ust;
448 	struct swapres sr;
449 	struct vnode *vp;
450 	int cnt = 0;
451 	int tmp_nswapfiles;
452 	int nswap;
453 	int length, nlen;
454 	int gplen = 0, plen;
455 	char *swapname;
456 	char *pname;
457 	char *tpname;
458 	struct anoninfo ai;
459 	spgcnt_t avail;
460 	int global = INGLOBALZONE(curproc);
461 
462 	/*
463 	 * When running in a zone we want to hide the details of the swap
464 	 * devices: we report there only being one swap device named "swap"
465 	 * having a size equal to the sum of the sizes of all real swap devices
466 	 * on the system.
467 	 */
468 	switch (sc_cmd) {
469 	case SC_GETNSWP:
470 		if (global)
471 			*rv = nswapfiles;
472 		else
473 			*rv = 1;
474 		return (0);
475 
476 	case SC_AINFO:
477 		/*
478 		 * Return anoninfo information with these changes:
479 		 * ani_max = maximum amount of swap space
480 		 *	(including potentially available physical memory)
481 		 * ani_free = amount of unallocated anonymous memory
482 		 *	(some of which might be reserved and including
483 		 *	 potentially available physical memory)
484 		 * ani_resv = amount of claimed (reserved) anonymous memory
485 		 */
486 		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
487 		ai.ani_max = (k_anoninfo.ani_max +
488 			k_anoninfo.ani_mem_resv) +avail;
489 
490 		ai.ani_free = k_anoninfo.ani_free + avail;
491 
492 		ai.ani_resv = k_anoninfo.ani_phys_resv +
493 		    k_anoninfo.ani_mem_resv;
494 
495 		if (copyout(&ai, sc_arg, sizeof (struct anoninfo)) != 0)
496 			return (EFAULT);
497 		return (0);
498 
499 	case SC_LIST:
500 		if (copyin(sc_arg, &length, sizeof (int)) != 0)
501 			return (EFAULT);
502 		if (!global) {
503 			struct swapent st;
504 			char *swappath = "swap";
505 
506 			if (length < 1)
507 				return (ENOMEM);
508 			ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
509 			if (copyin(ust, &st, sizeof (swapent_t)) != 0)
510 				return (EFAULT);
511 			st.ste_start = PAGESIZE >> SCTRSHFT;
512 			st.ste_length = (off_t)0;
513 			st.ste_pages = 0;
514 			st.ste_free = 0;
515 			st.ste_flags = 0;
516 			mutex_enter(&swapinfo_lock);
517 			for (sip = swapinfo, nswap = 0;
518 			    sip != NULL && nswap < nswapfiles;
519 			    sip = sip->si_next, nswap++) {
520 				st.ste_length +=
521 				    (sip->si_eoff - sip->si_soff) >> SCTRSHFT;
522 				st.ste_pages += sip->si_npgs;
523 				st.ste_free += sip->si_nfpgs;
524 			}
525 			mutex_exit(&swapinfo_lock);
526 			if (copyout(&st, ust, sizeof (swapent_t)) != 0 ||
527 			    copyout(swappath, st.ste_path,
528 				    strlen(swappath) + 1) != 0) {
529 				return (EFAULT);
530 			}
531 			*rv = 1;
532 			return (0);
533 		}
534 beginning:
535 		tmp_nswapfiles = nswapfiles;
536 		/* Return an error if not enough space for the whole table. */
537 		if (length < tmp_nswapfiles)
538 			return (ENOMEM);
539 		/*
540 		 * Get memory to hold the swap entries and their names. We'll
541 		 * copy the real entries into these and then copy these out.
542 		 * Allocating the pathname memory is only a guess so we may
543 		 * find that we need more and have to do it again.
544 		 * All this is because we have to hold the anon lock while
545 		 * traversing the swapinfo list, and we can't be doing copyouts
546 		 * and/or kmem_alloc()s during this.
547 		 */
548 		csip = kmem_zalloc(tmp_nswapfiles * sizeof (struct swapinfo),
549 		    KM_SLEEP);
550 retry:
551 		nlen = tmp_nswapfiles * (gplen += 100);
552 		pname = kmem_zalloc(nlen, KM_SLEEP);
553 
554 		mutex_enter(&swapinfo_lock);
555 
556 		if (tmp_nswapfiles != nswapfiles) {
557 			mutex_exit(&swapinfo_lock);
558 			kmem_free(pname, nlen);
559 			kmem_free(csip,
560 			    tmp_nswapfiles * sizeof (struct swapinfo));
561 			gplen = 0;
562 			goto beginning;
563 		}
564 		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
565 		    sip && nswap < tmp_nswapfiles;
566 		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
567 			plen = sip->si_pnamelen;
568 			if (tpname + plen - pname > nlen) {
569 				mutex_exit(&swapinfo_lock);
570 				kmem_free(pname, nlen);
571 				goto retry;
572 			}
573 			*tsip = *sip;
574 			tsip->si_pname = tpname;
575 			(void) strcpy(tsip->si_pname, sip->si_pname);
576 		}
577 		mutex_exit(&swapinfo_lock);
578 
579 		if (sip) {
580 			error = ENOMEM;
581 			goto lout;
582 		}
583 		ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
584 		for (tsip = csip, cnt = 0; cnt < nswap;  tsip++, ust++, cnt++) {
585 			if (copyin(ust, &st, sizeof (swapent_t)) != 0) {
586 				error = EFAULT;
587 				goto lout;
588 			}
589 			st.ste_flags = tsip->si_flags;
590 			st.ste_length =
591 			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
592 			st.ste_start = tsip->si_soff >> SCTRSHFT;
593 			st.ste_pages = tsip->si_npgs;
594 			st.ste_free = tsip->si_nfpgs;
595 			if (copyout(&st, ust, sizeof (swapent_t)) != 0) {
596 				error = EFAULT;
597 				goto lout;
598 			}
599 			if (!tsip->si_pnamelen)
600 				continue;
601 			if (copyout(tsip->si_pname, st.ste_path,
602 				tsip->si_pnamelen) != 0) {
603 				error = EFAULT;
604 				goto lout;
605 			}
606 		}
607 		*rv = nswap;
608 lout:
609 		kmem_free(csip, tmp_nswapfiles * sizeof (struct swapinfo));
610 		kmem_free(pname, nlen);
611 		return (error);
612 
613 	case SC_ADD:
614 	case SC_REMOVE:
615 		break;
616 	default:
617 		return (EINVAL);
618 	}
619 	if ((error = secpolicy_swapctl(CRED())) != 0)
620 		return (error);
621 
622 	if (copyin(sc_arg, &sr, sizeof (swapres_t)))
623 		return (EFAULT);
624 
625 	/* Allocate the space to read in pathname */
626 	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
627 		return (ENOMEM);
628 
629 	error = copyinstr(sr.sr_name, swapname, MAXPATHLEN, 0);
630 	if (error)
631 		goto out;
632 
633 	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
634 	if (error) {
635 		if (sc_cmd == SC_ADD)
636 			goto out;
637 		/* see if we match by name */
638 		vp = swapdel_byname(swapname, (size_t)sr.sr_start);
639 		if (vp == NULL)
640 			goto out;
641 	}
642 
643 	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
644 		VN_RELE(vp);
645 		error = ENOSYS;
646 		goto out;
647 	}
648 	switch (vp->v_type) {
649 	case VBLK:
650 		break;
651 
652 	case VREG:
653 		if (vp->v_vfsp && vn_is_readonly(vp))
654 			error = EROFS;
655 		else
656 			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED());
657 		break;
658 
659 	case VDIR:
660 		error = EISDIR;
661 		break;
662 	default:
663 		error = ENOSYS;
664 		break;
665 	}
666 	if (error == 0) {
667 		if (sc_cmd == SC_REMOVE)
668 			error = swapdel(vp, sr.sr_start);
669 		else
670 			error = swapadd(vp, sr.sr_start,
671 					sr.sr_length, swapname);
672 	}
673 	VN_RELE(vp);
674 out:
675 	kmem_free(swapname, MAXPATHLEN);
676 	return (error);
677 }
678 
679 #if defined(_LP64) && defined(_SYSCALL32)
680 
681 int
682 swapctl32(int sc_cmd, void *sc_arg, int *rv)
683 {
684 	struct swapinfo *sip, *csip, *tsip;
685 	int error = 0;
686 	struct swapent32 st, *ust;
687 	struct swapres32 sr;
688 	struct vnode *vp;
689 	int cnt = 0;
690 	int tmp_nswapfiles;
691 	int nswap;
692 	int length, nlen;
693 	int gplen = 0, plen;
694 	char *swapname;
695 	char *pname;
696 	char *tpname;
697 	struct anoninfo32 ai;
698 	size_t s;
699 	spgcnt_t avail;
700 
701 	switch (sc_cmd) {
702 	case SC_GETNSWP:
703 		*rv = nswapfiles;
704 		return (0);
705 
706 	case SC_AINFO:
707 		/*
708 		 * Return anoninfo information with these changes:
709 		 * ani_max = maximum amount of swap space
710 		 *	(including potentially available physical memory)
711 		 * ani_free = amount of unallocated anonymous memory
712 		 *	(some of which might be reserved and including
713 		 *	 potentially available physical memory)
714 		 * ani_resv = amount of claimed (reserved) anonymous memory
715 		 */
716 		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
717 		s = (k_anoninfo.ani_max + k_anoninfo.ani_mem_resv) + avail;
718 		if (s > UINT32_MAX)
719 			return (EOVERFLOW);
720 		ai.ani_max = s;
721 
722 		s = k_anoninfo.ani_free + avail;
723 		if (s > UINT32_MAX)
724 			return (EOVERFLOW);
725 		ai.ani_free = s;
726 
727 		s = k_anoninfo.ani_phys_resv + k_anoninfo.ani_mem_resv;
728 		if (s > UINT32_MAX)
729 			return (EOVERFLOW);
730 		ai.ani_resv = s;
731 
732 		if (copyout(&ai, sc_arg, sizeof (ai)) != 0)
733 			return (EFAULT);
734 		return (0);
735 
736 	case SC_LIST:
737 		if (copyin(sc_arg, &length, sizeof (int32_t)) != 0)
738 			return (EFAULT);
739 beginning:
740 		tmp_nswapfiles = nswapfiles;
741 		/* Return an error if not enough space for the whole table. */
742 		if (length < tmp_nswapfiles)
743 			return (ENOMEM);
744 		/*
745 		 * Get memory to hold the swap entries and their names. We'll
746 		 * copy the real entries into these and then copy these out.
747 		 * Allocating the pathname memory is only a guess so we may
748 		 * find that we need more and have to do it again.
749 		 * All this is because we have to hold the anon lock while
750 		 * traversing the swapinfo list, and we can't be doing copyouts
751 		 * and/or kmem_alloc()s during this.
752 		 */
753 		csip = kmem_zalloc(tmp_nswapfiles * sizeof (*csip), KM_SLEEP);
754 retry:
755 		nlen = tmp_nswapfiles * (gplen += 100);
756 		pname = kmem_zalloc(nlen, KM_SLEEP);
757 
758 		mutex_enter(&swapinfo_lock);
759 
760 		if (tmp_nswapfiles != nswapfiles) {
761 			mutex_exit(&swapinfo_lock);
762 			kmem_free(pname, nlen);
763 			kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
764 			gplen = 0;
765 			goto beginning;
766 		}
767 		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
768 		    (sip != NULL) && (nswap < tmp_nswapfiles);
769 		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
770 			plen = sip->si_pnamelen;
771 			if (tpname + plen - pname > nlen) {
772 				mutex_exit(&swapinfo_lock);
773 				kmem_free(pname, nlen);
774 				goto retry;
775 			}
776 			*tsip = *sip;
777 			tsip->si_pname = tpname;
778 			(void) strcpy(tsip->si_pname, sip->si_pname);
779 		}
780 		mutex_exit(&swapinfo_lock);
781 
782 		if (sip != NULL) {
783 			error = ENOMEM;
784 			goto lout;
785 		}
786 		ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent;
787 		for (tsip = csip, cnt = 0; cnt < nswap;  tsip++, ust++, cnt++) {
788 			if (copyin(ust, &st, sizeof (*ust)) != 0) {
789 				error = EFAULT;
790 				goto lout;
791 			}
792 			st.ste_flags = tsip->si_flags;
793 			st.ste_length =
794 			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
795 			st.ste_start = tsip->si_soff >> SCTRSHFT;
796 			st.ste_pages = tsip->si_npgs;
797 			st.ste_free = tsip->si_nfpgs;
798 			if (copyout(&st, ust, sizeof (st)) != 0) {
799 				error = EFAULT;
800 				goto lout;
801 			}
802 			if (!tsip->si_pnamelen)
803 				continue;
804 			if (copyout(tsip->si_pname,
805 			    (caddr_t)(uintptr_t)st.ste_path,
806 			    tsip->si_pnamelen) != 0) {
807 				error = EFAULT;
808 				goto lout;
809 			}
810 		}
811 		*rv = nswap;
812 lout:
813 		kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
814 		kmem_free(pname, nlen);
815 		return (error);
816 
817 	case SC_ADD:
818 	case SC_REMOVE:
819 		break;
820 	default:
821 		return (EINVAL);
822 	}
823 	if ((error = secpolicy_swapctl(CRED())) != 0)
824 		return (error);
825 
826 	if (copyin(sc_arg, &sr, sizeof (sr)))
827 		return (EFAULT);
828 
829 	/* Allocate the space to read in pathname */
830 	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
831 		return (ENOMEM);
832 
833 	error = copyinstr((caddr_t)(uintptr_t)sr.sr_name,
834 	    swapname, MAXPATHLEN, NULL);
835 	if (error)
836 		goto out;
837 
838 	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
839 	if (error) {
840 		if (sc_cmd == SC_ADD)
841 			goto out;
842 		/* see if we match by name */
843 		vp = swapdel_byname(swapname, (uint_t)sr.sr_start);
844 		if (vp == NULL)
845 			goto out;
846 	}
847 
848 	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
849 		VN_RELE(vp);
850 		error = ENOSYS;
851 		goto out;
852 	}
853 	switch (vp->v_type) {
854 	case VBLK:
855 		break;
856 
857 	case VREG:
858 		if (vp->v_vfsp && vn_is_readonly(vp))
859 			error = EROFS;
860 		else
861 			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED());
862 		break;
863 
864 	case VDIR:
865 		error = EISDIR;
866 		break;
867 	default:
868 		error = ENOSYS;
869 		break;
870 	}
871 	if (error == 0) {
872 		if (sc_cmd == SC_REMOVE)
873 			error = swapdel(vp, sr.sr_start);
874 		else
875 			error = swapadd(vp, sr.sr_start, sr.sr_length,
876 			    swapname);
877 	}
878 	VN_RELE(vp);
879 out:
880 	kmem_free(swapname, MAXPATHLEN);
881 	return (error);
882 }
883 
884 #endif /* _LP64 && _SYSCALL32 */
885 
886 /*
887  * Add a new swap file.
888  */
889 int
890 swapadd(struct vnode *vp, ulong_t lowblk, ulong_t nblks, char *swapname)
891 {
892 	struct swapinfo **sipp, *nsip = NULL, *esip = NULL;
893 	struct vnode *cvp;
894 	struct vattr vattr;
895 	pgcnt_t pages;
896 	u_offset_t soff, eoff;
897 	int error;
898 	ssize_t i, start, end;
899 	ushort_t wasswap;
900 	ulong_t startblk;
901 	size_t	returned_mem;
902 
903 	SWAP_PRINT(SW_CTL, "swapadd: vp %p lowblk %ld nblks %ld swapname %s\n",
904 	    vp, lowblk, nblks, swapname, 0);
905 	/*
906 	 * Get the real vnode. (If vp is not a specnode it just returns vp, so
907 	 * it does the right thing, but having this code know about specnodes
908 	 * violates the spirit of having it be indepedent of vnode type.)
909 	 */
910 	cvp = common_specvp(vp);
911 
912 	/*
913 	 * Or in VISSWAP so file system has chance to deny swap-ons during open.
914 	 */
915 	mutex_enter(&cvp->v_lock);
916 	wasswap = cvp->v_flag & VISSWAP;
917 	cvp->v_flag |= VISSWAP;
918 	mutex_exit(&cvp->v_lock);
919 
920 	mutex_enter(&swap_lock);
921 	if (error = VOP_OPEN(&cvp, FREAD|FWRITE, CRED())) {
922 		mutex_exit(&swap_lock);
923 		/* restore state of v_flag */
924 		if (!wasswap) {
925 			mutex_enter(&cvp->v_lock);
926 			cvp->v_flag &= ~VISSWAP;
927 			mutex_exit(&cvp->v_lock);
928 		}
929 		return (error);
930 	}
931 	mutex_exit(&swap_lock);
932 
933 	/*
934 	 * Get partition size. Return error if empty partition,
935 	 * or if request does not fit within the partition.
936 	 * If this is the first swap device, we can reduce
937 	 * the size of the swap area to match what is
938 	 * available.  This can happen if the system was built
939 	 * on a machine with a different size swap partition.
940 	 */
941 	vattr.va_mask = AT_SIZE;
942 	if (error = VOP_GETATTR(cvp, &vattr, ATTR_COMM, CRED()))
943 		goto out;
944 
945 	/*
946 	 * Specfs returns a va_size of MAXOFFSET_T (UNKNOWN_SIZE) when the
947 	 * size of the device can't be determined.
948 	 */
949 	if ((vattr.va_size == 0) || (vattr.va_size == MAXOFFSET_T)) {
950 		error = EINVAL;
951 		goto out;
952 	}
953 
954 #ifdef	_ILP32
955 	/*
956 	 * No support for large swap in 32-bit OS, if the size of the swap is
957 	 * bigger than MAXOFF32_T then the size used by swapfs must be limited.
958 	 * This limitation is imposed by the swap subsystem itself, a D_64BIT
959 	 * driver as the target of swap operation should be able to field
960 	 * the IO.
961 	 */
962 	if (vattr.va_size > MAXOFF32_T) {
963 		cmn_err(CE_NOTE,
964 			"!swap device %s truncated from 0x%llx to 0x%x bytes",
965 			swapname, vattr.va_size, MAXOFF32_T);
966 		vattr.va_size = MAXOFF32_T;
967 	}
968 #endif	/* _ILP32 */
969 
970 	/* Fail if file not writeable (try to set size to current size) */
971 	vattr.va_mask = AT_SIZE;
972 	if (error = VOP_SETATTR(cvp, &vattr, 0, CRED(), NULL))
973 		goto out;
974 
975 	/* Fail if fs does not support VOP_PAGEIO */
976 	error = VOP_PAGEIO(cvp, (page_t *)NULL, (u_offset_t)0, 0, 0, CRED());
977 
978 	if (error == ENOSYS)
979 		goto out;
980 	else
981 		error = 0;
982 	/*
983 	 * If swapping on the root filesystem don't put swap blocks that
984 	 * correspond to the miniroot filesystem on the swap free list.
985 	 */
986 	if (cvp == rootdir)
987 		startblk = roundup(MINIROOTSIZE<<SCTRSHFT, klustsize)>>SCTRSHFT;
988 	else				/* Skip 1st page (disk label) */
989 		startblk = (ulong_t)(lowblk ? lowblk : 1);
990 
991 	soff = startblk << SCTRSHFT;
992 	if (soff >= vattr.va_size) {
993 		error = EINVAL;
994 		goto out;
995 	}
996 
997 	/*
998 	 * If user specified 0 blks, use the size of the device
999 	 */
1000 	eoff = nblks ?  soff + (nblks - (startblk - lowblk) << SCTRSHFT) :
1001 			vattr.va_size;
1002 
1003 	SWAP_PRINT(SW_CTL, "swapadd: va_size %ld soff %ld eoff %ld\n",
1004 	    vattr.va_size, soff, eoff, 0, 0);
1005 
1006 	if (eoff > vattr.va_size) {
1007 		error = EINVAL;
1008 		goto out;
1009 	}
1010 
1011 	/*
1012 	 * The starting and ending offsets must be page aligned.
1013 	 * Round soff up to next page boundary, round eoff
1014 	 * down to previous page boundary.
1015 	 */
1016 	soff = ptob(btopr(soff));
1017 	eoff = ptob(btop(eoff));
1018 	if (soff >= eoff) {
1019 		SWAP_PRINT(SW_CTL, "swapadd: soff %ld >= eoff %ld\n",
1020 		    soff, eoff, 0, 0, 0);
1021 		error = EINVAL;
1022 		goto out;
1023 	}
1024 
1025 	pages = btop(eoff - soff);
1026 
1027 	/* Allocate and partially set up the new swapinfo */
1028 	nsip = kmem_zalloc(sizeof (struct swapinfo), KM_SLEEP);
1029 	nsip->si_vp = cvp;
1030 
1031 	nsip->si_soff = soff;
1032 	nsip->si_eoff = eoff;
1033 	nsip->si_hint = 0;
1034 	nsip->si_checkcnt = nsip->si_alloccnt = 0;
1035 
1036 	nsip->si_pnamelen = (int)strlen(swapname) + 1;
1037 	nsip->si_pname = (char *)kmem_zalloc(nsip->si_pnamelen, KM_SLEEP);
1038 	bcopy(swapname, nsip->si_pname, nsip->si_pnamelen - 1);
1039 	SWAP_PRINT(SW_CTL, "swapadd: allocating swapinfo for %s, %ld pages\n",
1040 	    swapname, pages, 0, 0, 0);
1041 	/*
1042 	 * Size of swapslots map in bytes
1043 	 */
1044 	nsip->si_mapsize = P2ROUNDUP(pages, NBBW) / NBBY;
1045 	nsip->si_swapslots = kmem_zalloc(nsip->si_mapsize, KM_SLEEP);
1046 
1047 	/*
1048 	 * Permanently set the bits that can't ever be allocated,
1049 	 * i.e. those from the ending offset to the round up slot for the
1050 	 * swapslots bit map.
1051 	 */
1052 	start = pages;
1053 	end = P2ROUNDUP(pages, NBBW);
1054 	for (i = start; i < end; i++) {
1055 		SWAP_PRINT(SW_CTL, "swapadd: set bit for page %ld\n", i,
1056 		    0, 0, 0, 0);
1057 		SETBIT(nsip->si_swapslots, i);
1058 	}
1059 	nsip->si_npgs = nsip->si_nfpgs = pages;
1060 	/*
1061 	 * Now check to see if we can add it. We wait til now to check because
1062 	 * we need the swapinfo_lock and we don't want sleep with it (e.g.,
1063 	 * during kmem_alloc()) while we're setting up the swapinfo.
1064 	 */
1065 	mutex_enter(&swapinfo_lock);
1066 	for (sipp = &swapinfo; (esip = *sipp) != NULL; sipp = &esip->si_next) {
1067 		if (esip->si_vp == cvp) {
1068 			if (esip->si_soff == soff && esip->si_npgs == pages &&
1069 			    (esip->si_flags & ST_DOINGDEL)) {
1070 				/*
1071 				 * We are adding a device that we are in the
1072 				 * middle of deleting. Just clear the
1073 				 * ST_DOINGDEL flag to signal this and
1074 				 * the deletion routine will eventually notice
1075 				 * it and add it back.
1076 				 */
1077 				esip->si_flags &= ~ST_DOINGDEL;
1078 				mutex_exit(&swapinfo_lock);
1079 				goto out;
1080 			}
1081 			/* disallow overlapping swap files */
1082 			if ((soff < esip->si_eoff) && (eoff > esip->si_soff)) {
1083 				error = EEXIST;
1084 				mutex_exit(&swapinfo_lock);
1085 				goto out;
1086 			}
1087 		}
1088 	}
1089 
1090 	nswapfiles++;
1091 
1092 	/*
1093 	 * add new swap device to list and shift allocations to it
1094 	 * before updating the anoninfo counters
1095 	 */
1096 	*sipp = nsip;
1097 	silast = nsip;
1098 
1099 	/*
1100 	 * Update the total amount of reservable swap space
1101 	 * accounting properly for swap space from physical memory
1102 	 */
1103 	/* New swap device soaks up currently reserved memory swap */
1104 	mutex_enter(&anoninfo_lock);
1105 
1106 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
1107 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
1108 
1109 	k_anoninfo.ani_max += pages;
1110 	ANI_ADD(pages);
1111 	if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
1112 		returned_mem = MIN(k_anoninfo.ani_mem_resv -
1113 		    k_anoninfo.ani_locked_swap,
1114 		    k_anoninfo.ani_max - k_anoninfo.ani_phys_resv);
1115 
1116 		ANI_ADD(-returned_mem);
1117 		k_anoninfo.ani_free -= returned_mem;
1118 		k_anoninfo.ani_mem_resv -= returned_mem;
1119 		k_anoninfo.ani_phys_resv += returned_mem;
1120 
1121 		mutex_enter(&freemem_lock);
1122 		availrmem += returned_mem;
1123 		mutex_exit(&freemem_lock);
1124 	}
1125 	/*
1126 	 * At boot time, to permit booting small memory machines using
1127 	 * only physical memory as swap space, we allowed a dangerously
1128 	 * large amount of memory to be used as swap space; now that
1129 	 * more physical backing store is available bump down the amount
1130 	 * we can get from memory to a safer size.
1131 	 */
1132 	if (swapfs_minfree < swapfs_desfree) {
1133 		mutex_enter(&freemem_lock);
1134 		if (availrmem > swapfs_desfree || !k_anoninfo.ani_mem_resv)
1135 			swapfs_minfree = swapfs_desfree;
1136 		mutex_exit(&freemem_lock);
1137 	}
1138 
1139 	SWAP_PRINT(SW_CTL, "swapadd: ani_max %ld ani_free %ld\n",
1140 	    k_anoninfo.ani_free, k_anoninfo.ani_free, 0, 0, 0);
1141 
1142 	mutex_exit(&anoninfo_lock);
1143 
1144 	mutex_exit(&swapinfo_lock);
1145 
1146 	/* Initialize the dump device */
1147 	mutex_enter(&dump_lock);
1148 	if (dumpvp == NULL)
1149 		(void) dumpinit(vp, swapname, 0);
1150 	mutex_exit(&dump_lock);
1151 
1152 	VN_HOLD(cvp);
1153 out:
1154 	if (error || esip) {
1155 		SWAP_PRINT(SW_CTL, "swapadd: error (%d)\n", error, 0, 0, 0, 0);
1156 
1157 		if (!wasswap) {
1158 			mutex_enter(&cvp->v_lock);
1159 			cvp->v_flag &= ~VISSWAP;
1160 			mutex_exit(&cvp->v_lock);
1161 		}
1162 		if (nsip) {
1163 			kmem_free(nsip->si_swapslots, (size_t)nsip->si_mapsize);
1164 			kmem_free(nsip->si_pname, nsip->si_pnamelen);
1165 			kmem_free(nsip, sizeof (*nsip));
1166 		}
1167 		mutex_enter(&swap_lock);
1168 		(void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED());
1169 		mutex_exit(&swap_lock);
1170 	}
1171 	return (error);
1172 }
1173 
1174 /*
1175  * Delete a swap file.
1176  */
1177 static int
1178 swapdel(
1179 	struct vnode *vp,
1180 	ulong_t lowblk) /* Low block number of area to delete. */
1181 {
1182 	struct swapinfo **sipp, *osip = NULL;
1183 	struct vnode *cvp;
1184 	u_offset_t soff;
1185 	int error = 0;
1186 	u_offset_t toff = 0;
1187 	struct vnode *tvp = NULL;
1188 	spgcnt_t pages;
1189 	struct anon **app, *ap;
1190 	kmutex_t *ahm;
1191 	pgcnt_t adjust_swap = 0;
1192 
1193 	/* Find the swap file entry for the file to be deleted */
1194 	cvp = common_specvp(vp);
1195 
1196 
1197 	lowblk = lowblk ? lowblk : 1; 	/* Skip first page (disk label) */
1198 	soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */
1199 
1200 	mutex_enter(&swapinfo_lock);
1201 	for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
1202 		if ((osip->si_vp == cvp) &&
1203 		    (osip->si_soff == soff) && (osip->si_flags == 0))
1204 			break;
1205 	}
1206 
1207 	/* If the file was not found, error.  */
1208 	if (osip == NULL) {
1209 		error = EINVAL;
1210 		mutex_exit(&swapinfo_lock);
1211 		goto out;
1212 	}
1213 
1214 	pages = osip->si_npgs;
1215 
1216 	/*
1217 	 * Do not delete if we will be low on swap pages.
1218 	 */
1219 	mutex_enter(&anoninfo_lock);
1220 
1221 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
1222 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
1223 
1224 	mutex_enter(&freemem_lock);
1225 	if (((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) +
1226 	    MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) < pages) {
1227 		mutex_exit(&freemem_lock);
1228 		mutex_exit(&anoninfo_lock);
1229 		error = ENOMEM;
1230 		cmn_err(CE_WARN, "swapdel - too few free pages");
1231 		mutex_exit(&swapinfo_lock);
1232 		goto out;
1233 	}
1234 	mutex_exit(&freemem_lock);
1235 
1236 	k_anoninfo.ani_max -= pages;
1237 
1238 	/* If needed, reserve memory swap to replace old device */
1239 	if (k_anoninfo.ani_phys_resv > k_anoninfo.ani_max) {
1240 		adjust_swap = k_anoninfo.ani_phys_resv - k_anoninfo.ani_max;
1241 		k_anoninfo.ani_phys_resv -= adjust_swap;
1242 		k_anoninfo.ani_mem_resv += adjust_swap;
1243 		mutex_enter(&freemem_lock);
1244 		availrmem -= adjust_swap;
1245 		mutex_exit(&freemem_lock);
1246 		ANI_ADD(adjust_swap);
1247 	}
1248 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
1249 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
1250 	mutex_exit(&anoninfo_lock);
1251 
1252 	ANI_ADD(-pages);
1253 
1254 	/*
1255 	 * Set the delete flag.  This prevents anyone from allocating more
1256 	 * pages from this file. Also set ST_DOINGDEL. Someone who wants to
1257 	 * add the file back while we're deleting it will signify by clearing
1258 	 * this flag.
1259 	 */
1260 	osip->si_flags |= ST_INDEL|ST_DOINGDEL;
1261 	mutex_exit(&swapinfo_lock);
1262 
1263 	/*
1264 	 * Free all the allocated physical slots for this file. We do this
1265 	 * by walking through the entire anon hash array, because we need
1266 	 * to update all the anon slots that have physical swap slots on
1267 	 * this file, and this is the only way to find them all. We go back
1268 	 * to the beginning of a bucket after each slot is freed because the
1269 	 * anonhash_lock is not held during the free and thus the hash table
1270 	 * may change under us.
1271 	 */
1272 	for (app = anon_hash; app < &anon_hash[ANON_HASH_SIZE]; app++) {
1273 		ahm = &anonhash_lock[(app-anon_hash) & (AH_LOCK_SIZE - 1)];
1274 		mutex_enter(ahm);
1275 top:
1276 		for (ap = *app; ap != NULL; ap = ap->an_hash) {
1277 			if (ap->an_pvp == cvp &&
1278 			    ap->an_poff >= osip->si_soff &&
1279 			    ap->an_poff < osip->si_eoff) {
1280 				ASSERT(TESTBIT(osip->si_swapslots,
1281 				    btop((size_t)(ap->an_poff -
1282 				    osip->si_soff))));
1283 				tvp = ap->an_vp;
1284 				toff = ap->an_off;
1285 				VN_HOLD(tvp);
1286 				mutex_exit(ahm);
1287 
1288 				error = swapslot_free(tvp, toff, osip);
1289 
1290 				VN_RELE(tvp);
1291 				mutex_enter(ahm);
1292 				if (!error && (osip->si_flags & ST_DOINGDEL)) {
1293 					goto top;
1294 				} else {
1295 					if (error) {
1296 						cmn_err(CE_WARN,
1297 						    "swapslot_free failed %d",
1298 						    error);
1299 					}
1300 
1301 					/*
1302 					 * Add device back before making it
1303 					 * visible.
1304 					 */
1305 					mutex_enter(&swapinfo_lock);
1306 					osip->si_flags &=
1307 					    ~(ST_INDEL | ST_DOINGDEL);
1308 					mutex_exit(&swapinfo_lock);
1309 
1310 					/*
1311 					 * Update the anon space available
1312 					 */
1313 					mutex_enter(&anoninfo_lock);
1314 
1315 					k_anoninfo.ani_phys_resv += adjust_swap;
1316 					k_anoninfo.ani_mem_resv -= adjust_swap;
1317 					k_anoninfo.ani_max += pages;
1318 
1319 					mutex_enter(&freemem_lock);
1320 					availrmem += adjust_swap;
1321 					mutex_exit(&freemem_lock);
1322 
1323 					mutex_exit(&anoninfo_lock);
1324 
1325 					ANI_ADD(pages);
1326 
1327 					mutex_exit(ahm);
1328 					goto out;
1329 				}
1330 			}
1331 		}
1332 		mutex_exit(ahm);
1333 	}
1334 
1335 	/* All done, they'd better all be free! */
1336 	mutex_enter(&swapinfo_lock);
1337 	ASSERT(osip->si_nfpgs == osip->si_npgs);
1338 
1339 	/* Now remove it from the swapinfo list */
1340 	for (sipp = &swapinfo; *sipp != NULL; sipp = &(*sipp)->si_next) {
1341 		if (*sipp == osip)
1342 			break;
1343 	}
1344 	ASSERT(*sipp);
1345 	*sipp = osip->si_next;
1346 	if (silast == osip)
1347 		if ((silast = osip->si_next) == NULL)
1348 			silast = swapinfo;
1349 	nswapfiles--;
1350 	mutex_exit(&swapinfo_lock);
1351 
1352 	kmem_free(osip->si_swapslots, osip->si_mapsize);
1353 	kmem_free(osip->si_pname, osip->si_pnamelen);
1354 	kmem_free(osip, sizeof (*osip));
1355 
1356 	mutex_enter(&dump_lock);
1357 	if (cvp == dumpvp)
1358 		dumpfini();
1359 	mutex_exit(&dump_lock);
1360 
1361 	/* Release the vnode */
1362 
1363 	mutex_enter(&swap_lock);
1364 	(void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED());
1365 	mutex_enter(&cvp->v_lock);
1366 	cvp->v_flag &= ~VISSWAP;
1367 	mutex_exit(&cvp->v_lock);
1368 	VN_RELE(cvp);
1369 	mutex_exit(&swap_lock);
1370 out:
1371 	return (error);
1372 }
1373 
1374 /*
1375  * Free up a physical swap slot on swapinfo sip, currently in use by the
1376  * anonymous page whose name is (vp, off).
1377  */
1378 static int
1379 swapslot_free(
1380 	struct vnode *vp,
1381 	u_offset_t off,
1382 	struct swapinfo *sip)
1383 {
1384 	struct page *pl[2], *pp;
1385 	struct anon *ap = NULL;
1386 	int error = 0;
1387 	kmutex_t *ahm;
1388 
1389 	/*
1390 	 * Get the page for the old swap slot and i/o lock it.
1391 	 * Users of the physical slot will synchronize on the i/o lock.
1392 	 */
1393 	if (error = VOP_GETPAGE(vp, (offset_t)off, ptob(1), NULL,
1394 	    pl, ptob(1), segkmap, NULL, S_READ, CRED())) {
1395 		/*
1396 		 * Anon slot went away (EIDRM) or vp was truncated (EFAULT)
1397 		 * while we got the page. Thus the physical slot must be
1398 		 * free, so we have succeeded.
1399 		 */
1400 		if (error == EIDRM || error == EFAULT)
1401 			error = 0;
1402 		return (error);
1403 	}
1404 	pp = pl[0];
1405 	page_io_lock(pp);
1406 
1407 	ahm = &anonhash_lock[AH_LOCK(vp, off)];
1408 	mutex_enter(ahm);
1409 	/*
1410 	 * Get the anon slot; anon struct cannot vanish while we hold
1411 	 * SE_SHARED lock on the physical page since anon_decref() blocks
1412 	 * in page_lookup() before it can proceed further to remove
1413 	 * anon struct from anon_hash table.
1414 	 */
1415 	if ((ap = swap_anon(vp, off)) == NULL) {
1416 		panic("swapslot_free(%p, %llx, %p), page: %p, null anon",
1417 			vp, off, sip, pp);
1418 	}
1419 	/*
1420 	 * Free the physical slot. It may have been freed up and replaced with
1421 	 * another one while we were getting the page so we have to re-verify
1422 	 * that this is really one we want. If we do free the slot we have
1423 	 * to mark the page modified, as its backing store is now gone.
1424 	 */
1425 	if (ap->an_pvp == sip->si_vp && ap->an_poff >= sip->si_soff &&
1426 	    ap->an_poff < sip->si_eoff) {
1427 		swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1428 		ap->an_pvp = NULL;
1429 		ap->an_poff = NULL;
1430 		mutex_exit(ahm);
1431 		hat_setmod(pp);
1432 	} else {
1433 		mutex_exit(ahm);
1434 	}
1435 out:
1436 	/* Release the page locks */
1437 	page_unlock(pp);
1438 	page_io_unlock(pp);
1439 	return (error);
1440 }
1441 
1442 /*
1443  * Get contig physical backing store for vp, in the range
1444  * [*offp, *offp + *lenp), May back a subrange of this, but must
1445  * always include the requested offset or fail. Returns the offsets
1446  * backed as [*offp, *offp + *lenp) and the physical offsets used to
1447  * back them from *pvpp in the range [*pstartp, *pstartp + *lenp).
1448  * Returns 	0 for success
1449  * 		SE_NOANON -- no anon slot for requested paged
1450  *		SE_NOSWAP -- no physical swap space available
1451  */
1452 int
1453 swap_newphysname(
1454 	struct vnode *vp,
1455 	u_offset_t offset,
1456 	u_offset_t *offp,
1457 	size_t *lenp,
1458 	struct vnode **pvpp,
1459 	u_offset_t *poffp)
1460 {
1461 	struct anon *ap = NULL;		/* anon slot for vp, off */
1462 	int error = 0;
1463 	struct vnode *pvp;
1464 	u_offset_t poff, pstart, prem;
1465 	size_t plen;
1466 	u_offset_t off, start;
1467 	kmutex_t *ahm;
1468 
1469 	ASSERT(*offp <= offset && offset < *offp + *lenp);
1470 
1471 	/* Get new physical swap slots. */
1472 	plen = *lenp;
1473 	if (!swap_phys_alloc(&pvp, &pstart, &plen, 0)) {
1474 		/*
1475 		 * No swap available so return error unless requested
1476 		 * offset is already backed in which case return that.
1477 		 */
1478 		ahm = &anonhash_lock[AH_LOCK(vp, offset)];
1479 		mutex_enter(ahm);
1480 		if ((ap = swap_anon(vp, offset)) == NULL) {
1481 			error = SE_NOANON;
1482 			mutex_exit(ahm);
1483 			return (error);
1484 		}
1485 		error = (ap->an_pvp ? 0 : SE_NOSWAP);
1486 		*offp = offset;
1487 		*lenp = PAGESIZE;
1488 		*pvpp = ap->an_pvp;
1489 		*poffp = ap->an_poff;
1490 		mutex_exit(ahm);
1491 		return (error);
1492 	}
1493 
1494 	/*
1495 	 * We got plen (<= *lenp) contig slots. Use these to back a
1496 	 * subrange of [*offp, *offp + *lenp) which includes offset.
1497 	 * For now we just put offset at the end of the kluster.
1498 	 * Clearly there are other possible choices - which is best?
1499 	 */
1500 	start = MAX(*offp,
1501 	    (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0);
1502 	ASSERT(start + plen <= *offp + *lenp);
1503 
1504 	for (off = start, poff = pstart; poff < pstart + plen;
1505 	    off += PAGESIZE, poff += PAGESIZE) {
1506 		ahm = &anonhash_lock[AH_LOCK(vp, off)];
1507 		mutex_enter(ahm);
1508 		if ((ap = swap_anon(vp, off)) != NULL) {
1509 			/* Free old slot if any, and assign new one */
1510 			if (ap->an_pvp)
1511 				swap_phys_free(ap->an_pvp, ap->an_poff,
1512 				    PAGESIZE);
1513 			ap->an_pvp = pvp;
1514 			ap->an_poff = poff;
1515 		} else {	/* No anon slot for a klustered page, quit. */
1516 			prem = (pstart + plen) - poff;
1517 			/* Already did requested page, do partial kluster */
1518 			if (off > offset) {
1519 				plen = poff - pstart;
1520 				error = 0;
1521 			/* Fail on requested page, error */
1522 			} else if (off == offset)  {
1523 				error = SE_NOANON;
1524 			/* Fail on prior page, fail on requested page, error */
1525 			} else if ((ap = swap_anon(vp, offset)) == NULL) {
1526 				error = SE_NOANON;
1527 			/* Fail on prior page, got requested page, do only it */
1528 			} else {
1529 				/* Free old slot if any, and assign new one */
1530 				if (ap->an_pvp)
1531 					swap_phys_free(ap->an_pvp, ap->an_poff,
1532 					    PAGESIZE);
1533 				ap->an_pvp = pvp;
1534 				ap->an_poff = poff;
1535 				/* One page kluster */
1536 				start = offset;
1537 				plen = PAGESIZE;
1538 				pstart = poff;
1539 				poff += PAGESIZE;
1540 				prem -= PAGESIZE;
1541 			}
1542 			/* Free unassigned slots */
1543 			swap_phys_free(pvp, poff, prem);
1544 			mutex_exit(ahm);
1545 			break;
1546 		}
1547 		mutex_exit(ahm);
1548 	}
1549 	ASSERT(*offp <= start && start + plen <= *offp + *lenp);
1550 	ASSERT(start <= offset && offset < start + plen);
1551 	*offp = start;
1552 	*lenp = plen;
1553 	*pvpp = pvp;
1554 	*poffp = pstart;
1555 	return (error);
1556 }
1557 
1558 
1559 /*
1560  * Get the physical swap backing store location for a given anonymous page
1561  * named (vp, off). The backing store name is returned in (*pvpp, *poffp).
1562  * Returns	0 		success
1563  *		EIDRM --	no anon slot (page is not allocated)
1564  */
1565 int
1566 swap_getphysname(
1567 	struct vnode *vp,
1568 	u_offset_t off,
1569 	struct vnode **pvpp,
1570 	u_offset_t *poffp)
1571 {
1572 	struct anon *ap;
1573 	int error = 0;
1574 	kmutex_t *ahm;
1575 
1576 	ahm = &anonhash_lock[AH_LOCK(vp, off)];
1577 	mutex_enter(ahm);
1578 
1579 	/* Get anon slot for vp, off */
1580 	ap = swap_anon(vp, off);
1581 	if (ap == NULL) {
1582 		error = EIDRM;
1583 		goto out;
1584 	}
1585 	*pvpp = ap->an_pvp;
1586 	*poffp = ap->an_poff;
1587 out:
1588 	mutex_exit(ahm);
1589 	return (error);
1590 }
1591