xref: /titanic_52/usr/src/uts/common/vm/vm_swap.c (revision b6c3f7863936abeae522e48a13887dddeb691a45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #pragma ident	"%Z%%M%	%I%	%E% SMI"
40 
41 /*
42  * Each physical swap area has an associated bitmap representing
43  * its physical storage. The bitmap records which swap slots are
44  * currently allocated or freed.  Allocation is done by searching
45  * through the bitmap for the first free slot. Thus, there's
46  * no linear relation between offset within the swap device and the
47  * address (within its segment(s)) of the page that the slot backs;
48  * instead, it's an arbitrary one-to-one mapping.
49  *
50  * Associated with each swap area is a swapinfo structure.  These
51  * structures are linked into a linear list that determines the
52  * ordering of swap areas in the logical swap device.  Each contains a
53  * pointer to the corresponding bitmap, the area's size, and its
54  * associated vnode.
55  */
56 
57 #include <sys/types.h>
58 #include <sys/inttypes.h>
59 #include <sys/param.h>
60 #include <sys/t_lock.h>
61 #include <sys/sysmacros.h>
62 #include <sys/systm.h>
63 #include <sys/errno.h>
64 #include <sys/kmem.h>
65 #include <sys/vfs.h>
66 #include <sys/vnode.h>
67 #include <sys/pathname.h>
68 #include <sys/cmn_err.h>
69 #include <sys/vtrace.h>
70 #include <sys/swap.h>
71 #include <sys/dumphdr.h>
72 #include <sys/debug.h>
73 #include <sys/fs/snode.h>
74 #include <sys/fs/swapnode.h>
75 #include <sys/policy.h>
76 #include <sys/zone.h>
77 
78 #include <vm/as.h>
79 #include <vm/seg.h>
80 #include <vm/page.h>
81 #include <vm/seg_vn.h>
82 #include <vm/hat.h>
83 #include <vm/anon.h>
84 #include <vm/seg_map.h>
85 
/*
 * To balance the load among multiple swap areas, we don't allow
 * more than swap_maxcontig allocations to be satisfied from a
 * single swap area before moving on to the next swap area.  This
 * effectively "interleaves" allocations among the many swap areas.
 *
 * swap_phys_alloc() compares this limit against si_allocs, which it
 * advances by btop(len) for every allocation, so the limit is counted
 * in pages.
 */
int swap_maxcontig;	/* set by anon_init() to 1 Mb */

/*
 * Size of the miniroot area that swapadd() keeps off the swap free list
 * when swapping on the root filesystem.  swapadd() shifts the value left
 * by SCTRSHFT before using it as a byte count, so it is expressed in
 * disk sectors.
 */
#define	MINIROOTSIZE	12000	/* ~6 Meg XXX */
95 
96 /*
97  * XXX - this lock is a kludge. It serializes some aspects of swapadd() and
98  * swapdel() (namely VOP_OPEN, VOP_CLOSE, VN_RELE).  It protects against
99  * somebody swapadd'ing and getting swap slots from a vnode, while someone
100  * else is in the process of closing or rele'ing it.
101  */
static kmutex_t swap_lock;

/* Protects the swapinfo list and the per-area bitmaps and counters. */
kmutex_t swapinfo_lock;

/*
 * protected by the swapinfo_lock
 */
struct swapinfo	*swapinfo;	/* head of the list of swap areas */

/*
 * silast is the swap area at which swap_phys_alloc() starts looking for
 * free slots; it is moved to the next area once si_allocs reaches
 * swap_maxcontig, and swapadd() points it at each newly added area.
 * nswapfiles counts the entries on the swapinfo list.
 */
static	struct	swapinfo *silast;
static	int	nswapfiles;

/* Forward declarations of file-local helpers. */
static u_offset_t	swap_getoff(struct swapinfo *);
static int	swapadd(struct vnode *, ulong_t, ulong_t, char *);
static int	swapdel(struct vnode *, ulong_t);
static int	swapslot_free(struct vnode *, u_offset_t, struct swapinfo *);
118 
/*
 * swap device bitmap allocation macros
 *
 * A bitmap is an array of 32-bit words (uint_t); slot i lives in word
 * (i >> MAPSHIFT) at bit position (i % NBBW).  A set bit means the slot
 * is allocated.
 *
 * The mask is built from an unsigned constant: shifting a signed 1 into
 * bit 31 is undefined behaviour in C, and slot numbers whose low five
 * bits are all ones need exactly that bit.
 */
#define	MAPSHIFT	5
#define	NBBW		(NBPW * NBBY)	/* number of bits per word */
#define	TESTBIT(map, i)		\
	(((map)[(i) >> MAPSHIFT] & (1U << ((i) % NBBW))))
#define	SETBIT(map, i)		\
	(((map)[(i) >> MAPSHIFT] |= (1U << ((i) % NBBW))))
#define	CLEARBIT(map, i)	\
	(((map)[(i) >> MAPSHIFT] &= ~(1U << ((i) % NBBW))))
127 
int swap_debug = 0;	/* set for debug printf's */
int swap_verify = 0;	/* set to verify slots when freeing and allocating */

/*
 * When nonzero, swap_phys_alloc() stops extending a contiguous run once
 * the run's length in bytes reaches this value.
 */
uint_t swapalloc_maxcontig;
132 
133 /*
134  * Allocate a range of up to *lenp contiguous slots (page) from a physical
135  * swap device. Flags are one of:
136  *	SA_NOT  Must have a slot from a physical swap device other than the
137  * 		the one containing input (*vpp, *offp).
138  * Less slots than requested may be returned. *lenp allocated slots are
139  * returned starting at *offp on *vpp.
140  * Returns 1 for a successful allocation, 0 for couldn't allocate any slots.
141  */
142 int
143 swap_phys_alloc(
144 	struct vnode **vpp,
145 	u_offset_t *offp,
146 	size_t *lenp,
147 	uint_t flags)
148 {
149 	struct swapinfo *sip;
150 	offset_t soff, noff;
151 	size_t len;
152 
153 	mutex_enter(&swapinfo_lock);
154 	sip = silast;
155 
156 	/* Find a desirable physical device and allocate from it. */
157 	do {
158 		if (sip == NULL)
159 			break;
160 		if (!(sip->si_flags & ST_INDEL) &&
161 		    (spgcnt_t)sip->si_nfpgs > 0) {
162 			/* Caller wants other than specified swap device */
163 			if (flags & SA_NOT) {
164 				if (*vpp != sip->si_vp ||
165 				    *offp < sip->si_soff ||
166 				    *offp >= sip->si_eoff)
167 					goto found;
168 			/* Caller is loose, will take anything */
169 			} else
170 				goto found;
171 		} else if (sip->si_nfpgs == 0)
172 			sip->si_allocs = 0;
173 		if ((sip = sip->si_next) == NULL)
174 			sip = swapinfo;
175 	} while (sip != silast);
176 	mutex_exit(&swapinfo_lock);
177 	return (0);
178 found:
179 	soff = swap_getoff(sip);
180 	sip->si_nfpgs--;
181 	if (soff == -1)
182 		panic("swap_alloc: swap_getoff failed!");
183 
184 	for (len = PAGESIZE; len < *lenp; len += PAGESIZE) {
185 		if (sip->si_nfpgs == 0)
186 			break;
187 		if (swapalloc_maxcontig && len >= swapalloc_maxcontig)
188 			break;
189 		noff = swap_getoff(sip);
190 		if (noff == -1) {
191 			break;
192 		} else if (noff != soff + len) {
193 			CLEARBIT(sip->si_swapslots, btop(noff - sip->si_soff));
194 			break;
195 		}
196 		sip->si_nfpgs--;
197 	}
198 	*vpp = sip->si_vp;
199 	*offp = soff;
200 	*lenp = len;
201 	ASSERT((spgcnt_t)sip->si_nfpgs >= 0);
202 	sip->si_allocs += btop(len);
203 	if (sip->si_allocs >= swap_maxcontig) {
204 		sip->si_allocs = 0;
205 		if ((silast = sip->si_next) == NULL)
206 			silast = swapinfo;
207 	}
208 	TRACE_2(TR_FAC_VM, TR_SWAP_ALLOC,
209 	    "swap_alloc:sip %p offset %lx", sip, soff);
210 	mutex_exit(&swapinfo_lock);
211 	return (1);
212 }
213 
/*
 * Selects the second pass of swap_getoff() after the scan from the hint
 * to the end of the bitmap fails: nonzero walks backwards from the hint
 * word, zero restarts from the first word of the bitmap.
 */
int swap_backsearch = 0;
215 
216 /*
217  * Get a free offset on swap device sip.
218  * Return >=0 offset if succeeded, -1 for failure.
219  */
220 static u_offset_t
221 swap_getoff(struct swapinfo *sip)
222 {
223 	uint_t *sp, *ep;
224 	size_t aoff, boff, poff, slotnumber;
225 
226 	ASSERT(MUTEX_HELD(&swapinfo_lock));
227 
228 	sip->si_alloccnt++;
229 	for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
230 	    ep = &sip->si_swapslots[sip->si_mapsize / NBPW]; sp < ep; sp++) {
231 		if (*sp != (uint_t)0xffffffff)
232 			goto foundentry;
233 		else
234 			sip->si_checkcnt++;
235 	}
236 	SWAP_PRINT(SW_ALLOC,
237 	    "swap_getoff: couldn't find slot from hint %ld to end\n",
238 	    sip->si_hint, 0, 0, 0, 0);
239 	/*
240 	 * Go backwards? Check for faster method XXX
241 	 */
242 	if (swap_backsearch) {
243 		for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
244 		    ep = sip->si_swapslots; sp > ep; sp--) {
245 			if (*sp != (uint_t)0xffffffff)
246 				goto foundentry;
247 			else
248 				sip->si_checkcnt++;
249 		}
250 	} else {
251 		for (sp = sip->si_swapslots,
252 		    ep = &sip->si_swapslots[sip->si_hint >> MAPSHIFT];
253 		    sp < ep; sp++) {
254 			if (*sp != (uint_t)0xffffffff)
255 				goto foundentry;
256 			else
257 				sip->si_checkcnt++;
258 		}
259 	}
260 	if (*sp == 0xffffffff) {
261 		cmn_err(CE_WARN, "No free swap slots!");
262 		return ((u_offset_t)-1);
263 	}
264 
265 foundentry:
266 	/*
267 	 * aoff is the page number offset (in bytes) of the si_swapslots
268 	 * array element containing a free page
269 	 *
270 	 * boff is the page number offset of the free page
271 	 * (i.e. cleared bit) in si_swapslots[aoff].
272 	 */
273 	aoff = ((char *)sp - (char *)sip->si_swapslots) * NBBY;
274 
275 	for (boff = (sip->si_hint % NBBW); boff < NBBW; boff++) {
276 		if (!TESTBIT(sip->si_swapslots, aoff + boff))
277 			goto foundslot;
278 		else
279 			sip->si_checkcnt++;
280 	}
281 	for (boff = 0; boff < (sip->si_hint % NBBW); boff++) {
282 		if (!TESTBIT(sip->si_swapslots, aoff + boff))
283 			goto foundslot;
284 		else
285 			sip->si_checkcnt++;
286 	}
287 	panic("swap_getoff: didn't find slot in word hint %ld", sip->si_hint);
288 
289 foundslot:
290 	/*
291 	 * Return the offset of the free page in swap device.
292 	 * Convert page number of byte offset and add starting
293 	 * offset of swap device.
294 	 */
295 	slotnumber = aoff + boff;
296 	SWAP_PRINT(SW_ALLOC, "swap_getoff: allocating slot %ld\n",
297 	    slotnumber, 0, 0, 0, 0);
298 	poff = ptob(slotnumber);
299 	if (poff + sip->si_soff >= sip->si_eoff)
300 		printf("ptob(aoff(%ld) + boff(%ld))(%ld) >= eoff(%ld)\n",
301 		    aoff, boff, ptob(slotnumber), (long)sip->si_eoff);
302 	ASSERT(poff < sip->si_eoff);
303 	/*
304 	 * We could verify here that the slot isn't already allocated
305 	 * by looking through all the anon slots.
306 	 */
307 	SETBIT(sip->si_swapslots, slotnumber);
308 	sip->si_hint = slotnumber + 1;	/* hint = next slot */
309 	return (poff + sip->si_soff);
310 }
311 
312 /*
313  * Free a swap page.
314  */
315 void
316 swap_phys_free(struct vnode *vp, u_offset_t off, size_t len)
317 {
318 	struct swapinfo *sip;
319 	ssize_t pagenumber, npage;
320 
321 	mutex_enter(&swapinfo_lock);
322 	sip = swapinfo;
323 
324 	do {
325 		if (sip->si_vp == vp &&
326 		    sip->si_soff <= off && off < sip->si_eoff) {
327 			for (pagenumber = btop(off - sip->si_soff),
328 			    npage = btop(len) + pagenumber;
329 			    pagenumber < npage; pagenumber++) {
330 				SWAP_PRINT(SW_ALLOC,
331 				    "swap_phys_free: freeing slot %ld on "
332 				    "sip %p\n",
333 				    pagenumber, sip, 0, 0, 0);
334 				if (!TESTBIT(sip->si_swapslots, pagenumber)) {
335 					panic(
336 					    "swap_phys_free: freeing free slot "
337 					    "%p,%lx\n", (void *)vp,
338 					    ptob(pagenumber) + sip->si_soff);
339 				}
340 				CLEARBIT(sip->si_swapslots, pagenumber);
341 				sip->si_nfpgs++;
342 			}
343 			ASSERT(sip->si_nfpgs <= sip->si_npgs);
344 			mutex_exit(&swapinfo_lock);
345 			return;
346 		}
347 	} while ((sip = sip->si_next) != NULL);
348 	panic("swap_phys_free");
349 	/*NOTREACHED*/
350 }
351 
352 /*
353  * Return the anon struct corresponding for the given
354  * <vnode, off> if it is part of the virtual swap device.
355  * Return the anon struct if found, otherwise NULL.
356  */
357 struct anon *
358 swap_anon(struct vnode *vp, u_offset_t off)
359 {
360 	struct anon *ap;
361 
362 	ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(vp, off)]));
363 
364 	for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL; ap = ap->an_hash) {
365 		if (ap->an_vp == vp && ap->an_off == off)
366 			return (ap);
367 	}
368 	return (NULL);
369 }
370 
371 
372 /*
373  * Determine if the vp offset range overlap a swap device.
374  */
375 int
376 swap_in_range(struct vnode *vp, u_offset_t offset, size_t len)
377 {
378 	struct swapinfo *sip;
379 	u_offset_t eoff;
380 
381 	eoff = offset + len;
382 	ASSERT(eoff > offset);
383 
384 	mutex_enter(&swapinfo_lock);
385 	sip = swapinfo;
386 	if (vp && sip) {
387 		do {
388 			if (vp != sip->si_vp || eoff <= sip->si_soff ||
389 			    offset >= sip->si_eoff)
390 				continue;
391 			mutex_exit(&swapinfo_lock);
392 			return (1);
393 		} while ((sip = sip->si_next) != NULL);
394 	}
395 	mutex_exit(&swapinfo_lock);
396 	return (0);
397 }
398 
399 /*
400  * See if name is one of our swap files
401  * even though lookupname failed.
402  * This can be used by swapdel to delete
403  * swap resources on remote machines
404  * where the link has gone down.
405  */
406 static struct vnode *
407 swapdel_byname(
408 	char 	*name,			/* pathname to delete */
409 	ulong_t lowblk) 	/* Low block number of area to delete */
410 {
411 	struct swapinfo **sipp, *osip;
412 	u_offset_t soff;
413 
414 	/*
415 	 * Find the swap file entry for the file to
416 	 * be deleted. Skip any entries that are in
417 	 * transition.
418 	 */
419 
420 	soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */
421 
422 	mutex_enter(&swapinfo_lock);
423 	for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
424 		if ((strcmp(osip->si_pname, name) == 0) &&
425 		    (osip->si_soff == soff) && (osip->si_flags == 0)) {
426 			struct vnode *vp = osip->si_vp;
427 
428 			VN_HOLD(vp);
429 			mutex_exit(&swapinfo_lock);
430 			return (vp);
431 		}
432 	}
433 	mutex_exit(&swapinfo_lock);
434 	return (NULL);
435 }
436 
437 
438 /*
439  * New system call to manipulate swap files.
440  */
441 int
442 swapctl(int sc_cmd, void *sc_arg, int *rv)
443 {
444 	struct swapinfo *sip, *csip, *tsip;
445 	int error = 0;
446 	struct swapent st, *ust;
447 	struct swapres sr;
448 	struct vnode *vp;
449 	int cnt = 0;
450 	int tmp_nswapfiles;
451 	int nswap;
452 	int length, nlen;
453 	int gplen = 0, plen;
454 	char *swapname;
455 	char *pname;
456 	char *tpname;
457 	struct anoninfo ai;
458 	spgcnt_t avail;
459 	int global = INGLOBALZONE(curproc);
460 
461 	/*
462 	 * When running in a zone we want to hide the details of the swap
463 	 * devices: we report there only being one swap device named "swap"
464 	 * having a size equal to the sum of the sizes of all real swap devices
465 	 * on the system.
466 	 */
467 	switch (sc_cmd) {
468 	case SC_GETNSWP:
469 		if (global)
470 			*rv = nswapfiles;
471 		else
472 			*rv = 1;
473 		return (0);
474 
475 	case SC_AINFO:
476 		/*
477 		 * Return anoninfo information with these changes:
478 		 * ani_max = maximum amount of swap space
479 		 *	(including potentially available physical memory)
480 		 * ani_free = amount of unallocated anonymous memory
481 		 *	(some of which might be reserved and including
482 		 *	 potentially available physical memory)
483 		 * ani_resv = amount of claimed (reserved) anonymous memory
484 		 */
485 		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
486 		ai.ani_max = (k_anoninfo.ani_max +
487 		    k_anoninfo.ani_mem_resv) +avail;
488 
489 		ai.ani_free = k_anoninfo.ani_free + avail;
490 
491 		ai.ani_resv = k_anoninfo.ani_phys_resv +
492 		    k_anoninfo.ani_mem_resv;
493 
494 		if (copyout(&ai, sc_arg, sizeof (struct anoninfo)) != 0)
495 			return (EFAULT);
496 		return (0);
497 
498 	case SC_LIST:
499 		if (copyin(sc_arg, &length, sizeof (int)) != 0)
500 			return (EFAULT);
501 		if (!global) {
502 			struct swapent st;
503 			char *swappath = "swap";
504 
505 			if (length < 1)
506 				return (ENOMEM);
507 			ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
508 			if (copyin(ust, &st, sizeof (swapent_t)) != 0)
509 				return (EFAULT);
510 			st.ste_start = PAGESIZE >> SCTRSHFT;
511 			st.ste_length = (off_t)0;
512 			st.ste_pages = 0;
513 			st.ste_free = 0;
514 			st.ste_flags = 0;
515 			mutex_enter(&swapinfo_lock);
516 			for (sip = swapinfo, nswap = 0;
517 			    sip != NULL && nswap < nswapfiles;
518 			    sip = sip->si_next, nswap++) {
519 				st.ste_length +=
520 				    (sip->si_eoff - sip->si_soff) >> SCTRSHFT;
521 				st.ste_pages += sip->si_npgs;
522 				st.ste_free += sip->si_nfpgs;
523 			}
524 			mutex_exit(&swapinfo_lock);
525 			if (copyout(&st, ust, sizeof (swapent_t)) != 0 ||
526 			    copyout(swappath, st.ste_path,
527 			    strlen(swappath) + 1) != 0) {
528 				return (EFAULT);
529 			}
530 			*rv = 1;
531 			return (0);
532 		}
533 beginning:
534 		tmp_nswapfiles = nswapfiles;
535 		/* Return an error if not enough space for the whole table. */
536 		if (length < tmp_nswapfiles)
537 			return (ENOMEM);
538 		/*
539 		 * Get memory to hold the swap entries and their names. We'll
540 		 * copy the real entries into these and then copy these out.
541 		 * Allocating the pathname memory is only a guess so we may
542 		 * find that we need more and have to do it again.
543 		 * All this is because we have to hold the anon lock while
544 		 * traversing the swapinfo list, and we can't be doing copyouts
545 		 * and/or kmem_alloc()s during this.
546 		 */
547 		csip = kmem_zalloc(tmp_nswapfiles * sizeof (struct swapinfo),
548 		    KM_SLEEP);
549 retry:
550 		nlen = tmp_nswapfiles * (gplen += 100);
551 		pname = kmem_zalloc(nlen, KM_SLEEP);
552 
553 		mutex_enter(&swapinfo_lock);
554 
555 		if (tmp_nswapfiles != nswapfiles) {
556 			mutex_exit(&swapinfo_lock);
557 			kmem_free(pname, nlen);
558 			kmem_free(csip,
559 			    tmp_nswapfiles * sizeof (struct swapinfo));
560 			gplen = 0;
561 			goto beginning;
562 		}
563 		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
564 		    sip && nswap < tmp_nswapfiles;
565 		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
566 			plen = sip->si_pnamelen;
567 			if (tpname + plen - pname > nlen) {
568 				mutex_exit(&swapinfo_lock);
569 				kmem_free(pname, nlen);
570 				goto retry;
571 			}
572 			*tsip = *sip;
573 			tsip->si_pname = tpname;
574 			(void) strcpy(tsip->si_pname, sip->si_pname);
575 		}
576 		mutex_exit(&swapinfo_lock);
577 
578 		if (sip) {
579 			error = ENOMEM;
580 			goto lout;
581 		}
582 		ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
583 		for (tsip = csip, cnt = 0; cnt < nswap;  tsip++, ust++, cnt++) {
584 			if (copyin(ust, &st, sizeof (swapent_t)) != 0) {
585 				error = EFAULT;
586 				goto lout;
587 			}
588 			st.ste_flags = tsip->si_flags;
589 			st.ste_length =
590 			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
591 			st.ste_start = tsip->si_soff >> SCTRSHFT;
592 			st.ste_pages = tsip->si_npgs;
593 			st.ste_free = tsip->si_nfpgs;
594 			if (copyout(&st, ust, sizeof (swapent_t)) != 0) {
595 				error = EFAULT;
596 				goto lout;
597 			}
598 			if (!tsip->si_pnamelen)
599 				continue;
600 			if (copyout(tsip->si_pname, st.ste_path,
601 			    tsip->si_pnamelen) != 0) {
602 				error = EFAULT;
603 				goto lout;
604 			}
605 		}
606 		*rv = nswap;
607 lout:
608 		kmem_free(csip, tmp_nswapfiles * sizeof (struct swapinfo));
609 		kmem_free(pname, nlen);
610 		return (error);
611 
612 	case SC_ADD:
613 	case SC_REMOVE:
614 		break;
615 	default:
616 		return (EINVAL);
617 	}
618 	if ((error = secpolicy_swapctl(CRED())) != 0)
619 		return (error);
620 
621 	if (copyin(sc_arg, &sr, sizeof (swapres_t)))
622 		return (EFAULT);
623 
624 	/* Allocate the space to read in pathname */
625 	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
626 		return (ENOMEM);
627 
628 	error = copyinstr(sr.sr_name, swapname, MAXPATHLEN, 0);
629 	if (error)
630 		goto out;
631 
632 	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
633 	if (error) {
634 		if (sc_cmd == SC_ADD)
635 			goto out;
636 		/* see if we match by name */
637 		vp = swapdel_byname(swapname, (size_t)sr.sr_start);
638 		if (vp == NULL)
639 			goto out;
640 	}
641 
642 	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
643 		VN_RELE(vp);
644 		error = ENOSYS;
645 		goto out;
646 	}
647 	switch (vp->v_type) {
648 	case VBLK:
649 		break;
650 
651 	case VREG:
652 		if (vp->v_vfsp && vn_is_readonly(vp))
653 			error = EROFS;
654 		else
655 			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL);
656 		break;
657 
658 	case VDIR:
659 		error = EISDIR;
660 		break;
661 	default:
662 		error = ENOSYS;
663 		break;
664 	}
665 	if (error == 0) {
666 		if (sc_cmd == SC_REMOVE)
667 			error = swapdel(vp, sr.sr_start);
668 		else
669 			error = swapadd(vp, sr.sr_start,
670 			    sr.sr_length, swapname);
671 	}
672 	VN_RELE(vp);
673 out:
674 	kmem_free(swapname, MAXPATHLEN);
675 	return (error);
676 }
677 
678 #if defined(_LP64) && defined(_SYSCALL32)
679 
680 int
681 swapctl32(int sc_cmd, void *sc_arg, int *rv)
682 {
683 	struct swapinfo *sip, *csip, *tsip;
684 	int error = 0;
685 	struct swapent32 st, *ust;
686 	struct swapres32 sr;
687 	struct vnode *vp;
688 	int cnt = 0;
689 	int tmp_nswapfiles;
690 	int nswap;
691 	int length, nlen;
692 	int gplen = 0, plen;
693 	char *swapname;
694 	char *pname;
695 	char *tpname;
696 	struct anoninfo32 ai;
697 	size_t s;
698 	spgcnt_t avail;
699 
700 	switch (sc_cmd) {
701 	case SC_GETNSWP:
702 		*rv = nswapfiles;
703 		return (0);
704 
705 	case SC_AINFO:
706 		/*
707 		 * Return anoninfo information with these changes:
708 		 * ani_max = maximum amount of swap space
709 		 *	(including potentially available physical memory)
710 		 * ani_free = amount of unallocated anonymous memory
711 		 *	(some of which might be reserved and including
712 		 *	 potentially available physical memory)
713 		 * ani_resv = amount of claimed (reserved) anonymous memory
714 		 */
715 		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
716 		s = (k_anoninfo.ani_max + k_anoninfo.ani_mem_resv) + avail;
717 		if (s > UINT32_MAX)
718 			return (EOVERFLOW);
719 		ai.ani_max = s;
720 
721 		s = k_anoninfo.ani_free + avail;
722 		if (s > UINT32_MAX)
723 			return (EOVERFLOW);
724 		ai.ani_free = s;
725 
726 		s = k_anoninfo.ani_phys_resv + k_anoninfo.ani_mem_resv;
727 		if (s > UINT32_MAX)
728 			return (EOVERFLOW);
729 		ai.ani_resv = s;
730 
731 		if (copyout(&ai, sc_arg, sizeof (ai)) != 0)
732 			return (EFAULT);
733 		return (0);
734 
735 	case SC_LIST:
736 		if (copyin(sc_arg, &length, sizeof (int32_t)) != 0)
737 			return (EFAULT);
738 beginning:
739 		tmp_nswapfiles = nswapfiles;
740 		/* Return an error if not enough space for the whole table. */
741 		if (length < tmp_nswapfiles)
742 			return (ENOMEM);
743 		/*
744 		 * Get memory to hold the swap entries and their names. We'll
745 		 * copy the real entries into these and then copy these out.
746 		 * Allocating the pathname memory is only a guess so we may
747 		 * find that we need more and have to do it again.
748 		 * All this is because we have to hold the anon lock while
749 		 * traversing the swapinfo list, and we can't be doing copyouts
750 		 * and/or kmem_alloc()s during this.
751 		 */
752 		csip = kmem_zalloc(tmp_nswapfiles * sizeof (*csip), KM_SLEEP);
753 retry:
754 		nlen = tmp_nswapfiles * (gplen += 100);
755 		pname = kmem_zalloc(nlen, KM_SLEEP);
756 
757 		mutex_enter(&swapinfo_lock);
758 
759 		if (tmp_nswapfiles != nswapfiles) {
760 			mutex_exit(&swapinfo_lock);
761 			kmem_free(pname, nlen);
762 			kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
763 			gplen = 0;
764 			goto beginning;
765 		}
766 		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
767 		    (sip != NULL) && (nswap < tmp_nswapfiles);
768 		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
769 			plen = sip->si_pnamelen;
770 			if (tpname + plen - pname > nlen) {
771 				mutex_exit(&swapinfo_lock);
772 				kmem_free(pname, nlen);
773 				goto retry;
774 			}
775 			*tsip = *sip;
776 			tsip->si_pname = tpname;
777 			(void) strcpy(tsip->si_pname, sip->si_pname);
778 		}
779 		mutex_exit(&swapinfo_lock);
780 
781 		if (sip != NULL) {
782 			error = ENOMEM;
783 			goto lout;
784 		}
785 		ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent;
786 		for (tsip = csip, cnt = 0; cnt < nswap;  tsip++, ust++, cnt++) {
787 			if (copyin(ust, &st, sizeof (*ust)) != 0) {
788 				error = EFAULT;
789 				goto lout;
790 			}
791 			st.ste_flags = tsip->si_flags;
792 			st.ste_length =
793 			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
794 			st.ste_start = tsip->si_soff >> SCTRSHFT;
795 			st.ste_pages = tsip->si_npgs;
796 			st.ste_free = tsip->si_nfpgs;
797 			if (copyout(&st, ust, sizeof (st)) != 0) {
798 				error = EFAULT;
799 				goto lout;
800 			}
801 			if (!tsip->si_pnamelen)
802 				continue;
803 			if (copyout(tsip->si_pname,
804 			    (caddr_t)(uintptr_t)st.ste_path,
805 			    tsip->si_pnamelen) != 0) {
806 				error = EFAULT;
807 				goto lout;
808 			}
809 		}
810 		*rv = nswap;
811 lout:
812 		kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
813 		kmem_free(pname, nlen);
814 		return (error);
815 
816 	case SC_ADD:
817 	case SC_REMOVE:
818 		break;
819 	default:
820 		return (EINVAL);
821 	}
822 	if ((error = secpolicy_swapctl(CRED())) != 0)
823 		return (error);
824 
825 	if (copyin(sc_arg, &sr, sizeof (sr)))
826 		return (EFAULT);
827 
828 	/* Allocate the space to read in pathname */
829 	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
830 		return (ENOMEM);
831 
832 	error = copyinstr((caddr_t)(uintptr_t)sr.sr_name,
833 	    swapname, MAXPATHLEN, NULL);
834 	if (error)
835 		goto out;
836 
837 	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
838 	if (error) {
839 		if (sc_cmd == SC_ADD)
840 			goto out;
841 		/* see if we match by name */
842 		vp = swapdel_byname(swapname, (uint_t)sr.sr_start);
843 		if (vp == NULL)
844 			goto out;
845 	}
846 
847 	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
848 		VN_RELE(vp);
849 		error = ENOSYS;
850 		goto out;
851 	}
852 	switch (vp->v_type) {
853 	case VBLK:
854 		break;
855 
856 	case VREG:
857 		if (vp->v_vfsp && vn_is_readonly(vp))
858 			error = EROFS;
859 		else
860 			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL);
861 		break;
862 
863 	case VDIR:
864 		error = EISDIR;
865 		break;
866 	default:
867 		error = ENOSYS;
868 		break;
869 	}
870 	if (error == 0) {
871 		if (sc_cmd == SC_REMOVE)
872 			error = swapdel(vp, sr.sr_start);
873 		else
874 			error = swapadd(vp, sr.sr_start, sr.sr_length,
875 			    swapname);
876 	}
877 	VN_RELE(vp);
878 out:
879 	kmem_free(swapname, MAXPATHLEN);
880 	return (error);
881 }
882 
883 #endif /* _LP64 && _SYSCALL32 */
884 
885 /*
886  * Add a new swap file.
887  */
888 int
889 swapadd(struct vnode *vp, ulong_t lowblk, ulong_t nblks, char *swapname)
890 {
891 	struct swapinfo **sipp, *nsip = NULL, *esip = NULL;
892 	struct vnode *cvp;
893 	struct vattr vattr;
894 	pgcnt_t pages;
895 	u_offset_t soff, eoff;
896 	int error;
897 	ssize_t i, start, end;
898 	ushort_t wasswap;
899 	ulong_t startblk;
900 	size_t	returned_mem;
901 
902 	SWAP_PRINT(SW_CTL, "swapadd: vp %p lowblk %ld nblks %ld swapname %s\n",
903 	    vp, lowblk, nblks, swapname, 0);
904 	/*
905 	 * Get the real vnode. (If vp is not a specnode it just returns vp, so
906 	 * it does the right thing, but having this code know about specnodes
907 	 * violates the spirit of having it be indepedent of vnode type.)
908 	 */
909 	cvp = common_specvp(vp);
910 
911 	/*
912 	 * Or in VISSWAP so file system has chance to deny swap-ons during open.
913 	 */
914 	mutex_enter(&cvp->v_lock);
915 	wasswap = cvp->v_flag & VISSWAP;
916 	cvp->v_flag |= VISSWAP;
917 	mutex_exit(&cvp->v_lock);
918 
919 	mutex_enter(&swap_lock);
920 	if (error = VOP_OPEN(&cvp, FREAD|FWRITE, CRED(), NULL)) {
921 		mutex_exit(&swap_lock);
922 		/* restore state of v_flag */
923 		if (!wasswap) {
924 			mutex_enter(&cvp->v_lock);
925 			cvp->v_flag &= ~VISSWAP;
926 			mutex_exit(&cvp->v_lock);
927 		}
928 		return (error);
929 	}
930 	mutex_exit(&swap_lock);
931 
932 	/*
933 	 * Get partition size. Return error if empty partition,
934 	 * or if request does not fit within the partition.
935 	 * If this is the first swap device, we can reduce
936 	 * the size of the swap area to match what is
937 	 * available.  This can happen if the system was built
938 	 * on a machine with a different size swap partition.
939 	 */
940 	vattr.va_mask = AT_SIZE;
941 	if (error = VOP_GETATTR(cvp, &vattr, ATTR_COMM, CRED(), NULL))
942 		goto out;
943 
944 	/*
945 	 * Specfs returns a va_size of MAXOFFSET_T (UNKNOWN_SIZE) when the
946 	 * size of the device can't be determined.
947 	 */
948 	if ((vattr.va_size == 0) || (vattr.va_size == MAXOFFSET_T)) {
949 		error = EINVAL;
950 		goto out;
951 	}
952 
953 #ifdef	_ILP32
954 	/*
955 	 * No support for large swap in 32-bit OS, if the size of the swap is
956 	 * bigger than MAXOFF32_T then the size used by swapfs must be limited.
957 	 * This limitation is imposed by the swap subsystem itself, a D_64BIT
958 	 * driver as the target of swap operation should be able to field
959 	 * the IO.
960 	 */
961 	if (vattr.va_size > MAXOFF32_T) {
962 		cmn_err(CE_NOTE,
963 		    "!swap device %s truncated from 0x%llx to 0x%x bytes",
964 		    swapname, vattr.va_size, MAXOFF32_T);
965 		vattr.va_size = MAXOFF32_T;
966 	}
967 #endif	/* _ILP32 */
968 
969 	/* Fail if file not writeable (try to set size to current size) */
970 	vattr.va_mask = AT_SIZE;
971 	if (error = VOP_SETATTR(cvp, &vattr, 0, CRED(), NULL))
972 		goto out;
973 
974 	/* Fail if fs does not support VOP_PAGEIO */
975 	error = VOP_PAGEIO(cvp, (page_t *)NULL, (u_offset_t)0, 0, 0, CRED(),
976 	    NULL);
977 
978 	if (error == ENOSYS)
979 		goto out;
980 	else
981 		error = 0;
982 	/*
983 	 * If swapping on the root filesystem don't put swap blocks that
984 	 * correspond to the miniroot filesystem on the swap free list.
985 	 */
986 	if (cvp == rootdir)
987 		startblk = roundup(MINIROOTSIZE<<SCTRSHFT, klustsize)>>SCTRSHFT;
988 	else				/* Skip 1st page (disk label) */
989 		startblk = (ulong_t)(lowblk ? lowblk : 1);
990 
991 	soff = startblk << SCTRSHFT;
992 	if (soff >= vattr.va_size) {
993 		error = EINVAL;
994 		goto out;
995 	}
996 
997 	/*
998 	 * If user specified 0 blks, use the size of the device
999 	 */
1000 	eoff = nblks ?  soff + (nblks - (startblk - lowblk) << SCTRSHFT) :
1001 	    vattr.va_size;
1002 
1003 	SWAP_PRINT(SW_CTL, "swapadd: va_size %ld soff %ld eoff %ld\n",
1004 	    vattr.va_size, soff, eoff, 0, 0);
1005 
1006 	if (eoff > vattr.va_size) {
1007 		error = EINVAL;
1008 		goto out;
1009 	}
1010 
1011 	/*
1012 	 * The starting and ending offsets must be page aligned.
1013 	 * Round soff up to next page boundary, round eoff
1014 	 * down to previous page boundary.
1015 	 */
1016 	soff = ptob(btopr(soff));
1017 	eoff = ptob(btop(eoff));
1018 	if (soff >= eoff) {
1019 		SWAP_PRINT(SW_CTL, "swapadd: soff %ld >= eoff %ld\n",
1020 		    soff, eoff, 0, 0, 0);
1021 		error = EINVAL;
1022 		goto out;
1023 	}
1024 
1025 	pages = btop(eoff - soff);
1026 
1027 	/* Allocate and partially set up the new swapinfo */
1028 	nsip = kmem_zalloc(sizeof (struct swapinfo), KM_SLEEP);
1029 	nsip->si_vp = cvp;
1030 
1031 	nsip->si_soff = soff;
1032 	nsip->si_eoff = eoff;
1033 	nsip->si_hint = 0;
1034 	nsip->si_checkcnt = nsip->si_alloccnt = 0;
1035 
1036 	nsip->si_pnamelen = (int)strlen(swapname) + 1;
1037 	nsip->si_pname = (char *)kmem_zalloc(nsip->si_pnamelen, KM_SLEEP);
1038 	bcopy(swapname, nsip->si_pname, nsip->si_pnamelen - 1);
1039 	SWAP_PRINT(SW_CTL, "swapadd: allocating swapinfo for %s, %ld pages\n",
1040 	    swapname, pages, 0, 0, 0);
1041 	/*
1042 	 * Size of swapslots map in bytes
1043 	 */
1044 	nsip->si_mapsize = P2ROUNDUP(pages, NBBW) / NBBY;
1045 	nsip->si_swapslots = kmem_zalloc(nsip->si_mapsize, KM_SLEEP);
1046 
1047 	/*
1048 	 * Permanently set the bits that can't ever be allocated,
1049 	 * i.e. those from the ending offset to the round up slot for the
1050 	 * swapslots bit map.
1051 	 */
1052 	start = pages;
1053 	end = P2ROUNDUP(pages, NBBW);
1054 	for (i = start; i < end; i++) {
1055 		SWAP_PRINT(SW_CTL, "swapadd: set bit for page %ld\n", i,
1056 		    0, 0, 0, 0);
1057 		SETBIT(nsip->si_swapslots, i);
1058 	}
1059 	nsip->si_npgs = nsip->si_nfpgs = pages;
1060 	/*
1061 	 * Now check to see if we can add it. We wait til now to check because
1062 	 * we need the swapinfo_lock and we don't want sleep with it (e.g.,
1063 	 * during kmem_alloc()) while we're setting up the swapinfo.
1064 	 */
1065 	mutex_enter(&swapinfo_lock);
1066 	for (sipp = &swapinfo; (esip = *sipp) != NULL; sipp = &esip->si_next) {
1067 		if (esip->si_vp == cvp) {
1068 			if (esip->si_soff == soff && esip->si_npgs == pages &&
1069 			    (esip->si_flags & ST_DOINGDEL)) {
1070 				/*
1071 				 * We are adding a device that we are in the
1072 				 * middle of deleting. Just clear the
1073 				 * ST_DOINGDEL flag to signal this and
1074 				 * the deletion routine will eventually notice
1075 				 * it and add it back.
1076 				 */
1077 				esip->si_flags &= ~ST_DOINGDEL;
1078 				mutex_exit(&swapinfo_lock);
1079 				goto out;
1080 			}
1081 			/* disallow overlapping swap files */
1082 			if ((soff < esip->si_eoff) && (eoff > esip->si_soff)) {
1083 				error = EEXIST;
1084 				mutex_exit(&swapinfo_lock);
1085 				goto out;
1086 			}
1087 		}
1088 	}
1089 
1090 	nswapfiles++;
1091 
1092 	/*
1093 	 * add new swap device to list and shift allocations to it
1094 	 * before updating the anoninfo counters
1095 	 */
1096 	*sipp = nsip;
1097 	silast = nsip;
1098 
1099 	/*
1100 	 * Update the total amount of reservable swap space
1101 	 * accounting properly for swap space from physical memory
1102 	 */
1103 	/* New swap device soaks up currently reserved memory swap */
1104 	mutex_enter(&anoninfo_lock);
1105 
1106 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
1107 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
1108 
1109 	k_anoninfo.ani_max += pages;
1110 	ANI_ADD(pages);
1111 	if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
1112 		returned_mem = MIN(k_anoninfo.ani_mem_resv -
1113 		    k_anoninfo.ani_locked_swap,
1114 		    k_anoninfo.ani_max - k_anoninfo.ani_phys_resv);
1115 
1116 		ANI_ADD(-returned_mem);
1117 		k_anoninfo.ani_free -= returned_mem;
1118 		k_anoninfo.ani_mem_resv -= returned_mem;
1119 		k_anoninfo.ani_phys_resv += returned_mem;
1120 
1121 		mutex_enter(&freemem_lock);
1122 		availrmem += returned_mem;
1123 		mutex_exit(&freemem_lock);
1124 	}
1125 	/*
1126 	 * At boot time, to permit booting small memory machines using
1127 	 * only physical memory as swap space, we allowed a dangerously
1128 	 * large amount of memory to be used as swap space; now that
1129 	 * more physical backing store is available bump down the amount
1130 	 * we can get from memory to a safer size.
1131 	 */
1132 	if (swapfs_minfree < swapfs_desfree) {
1133 		mutex_enter(&freemem_lock);
1134 		if (availrmem > swapfs_desfree || !k_anoninfo.ani_mem_resv)
1135 			swapfs_minfree = swapfs_desfree;
1136 		mutex_exit(&freemem_lock);
1137 	}
1138 
1139 	SWAP_PRINT(SW_CTL, "swapadd: ani_max %ld ani_free %ld\n",
1140 	    k_anoninfo.ani_free, k_anoninfo.ani_free, 0, 0, 0);
1141 
1142 	mutex_exit(&anoninfo_lock);
1143 
1144 	mutex_exit(&swapinfo_lock);
1145 
1146 	/* Initialize the dump device */
1147 	mutex_enter(&dump_lock);
1148 	if (dumpvp == NULL)
1149 		(void) dumpinit(vp, swapname, 0);
1150 	mutex_exit(&dump_lock);
1151 
1152 	VN_HOLD(cvp);
1153 out:
1154 	if (error || esip) {
1155 		SWAP_PRINT(SW_CTL, "swapadd: error (%d)\n", error, 0, 0, 0, 0);
1156 
1157 		if (!wasswap) {
1158 			mutex_enter(&cvp->v_lock);
1159 			cvp->v_flag &= ~VISSWAP;
1160 			mutex_exit(&cvp->v_lock);
1161 		}
1162 		if (nsip) {
1163 			kmem_free(nsip->si_swapslots, (size_t)nsip->si_mapsize);
1164 			kmem_free(nsip->si_pname, nsip->si_pnamelen);
1165 			kmem_free(nsip, sizeof (*nsip));
1166 		}
1167 		mutex_enter(&swap_lock);
1168 		(void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED(),
1169 		    NULL);
1170 		mutex_exit(&swap_lock);
1171 	}
1172 	return (error);
1173 }
1174 
1175 /*
1176  * Delete a swap file.
1177  */
1178 static int
1179 swapdel(
1180 	struct vnode *vp,
1181 	ulong_t lowblk) /* Low block number of area to delete. */
1182 {
1183 	struct swapinfo **sipp, *osip = NULL;
1184 	struct vnode *cvp;
1185 	u_offset_t soff;
1186 	int error = 0;
1187 	u_offset_t toff = 0;
1188 	struct vnode *tvp = NULL;
1189 	spgcnt_t pages;
1190 	struct anon **app, *ap;
1191 	kmutex_t *ahm;
1192 	pgcnt_t adjust_swap = 0;
1193 
1194 	/* Find the swap file entry for the file to be deleted */
1195 	cvp = common_specvp(vp);
1196 
1197 
1198 	lowblk = lowblk ? lowblk : 1; 	/* Skip first page (disk label) */
1199 	soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */
1200 
1201 	mutex_enter(&swapinfo_lock);
1202 	for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
1203 		if ((osip->si_vp == cvp) &&
1204 		    (osip->si_soff == soff) && (osip->si_flags == 0))
1205 			break;
1206 	}
1207 
1208 	/* If the file was not found, error.  */
1209 	if (osip == NULL) {
1210 		error = EINVAL;
1211 		mutex_exit(&swapinfo_lock);
1212 		goto out;
1213 	}
1214 
1215 	pages = osip->si_npgs;
1216 
1217 	/*
1218 	 * Do not delete if we will be low on swap pages.
1219 	 */
1220 	mutex_enter(&anoninfo_lock);
1221 
1222 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
1223 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
1224 
1225 	mutex_enter(&freemem_lock);
1226 	if (((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) +
1227 	    MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) < pages) {
1228 		mutex_exit(&freemem_lock);
1229 		mutex_exit(&anoninfo_lock);
1230 		error = ENOMEM;
1231 		cmn_err(CE_WARN, "swapdel - too few free pages");
1232 		mutex_exit(&swapinfo_lock);
1233 		goto out;
1234 	}
1235 	mutex_exit(&freemem_lock);
1236 
1237 	k_anoninfo.ani_max -= pages;
1238 
1239 	/* If needed, reserve memory swap to replace old device */
1240 	if (k_anoninfo.ani_phys_resv > k_anoninfo.ani_max) {
1241 		adjust_swap = k_anoninfo.ani_phys_resv - k_anoninfo.ani_max;
1242 		k_anoninfo.ani_phys_resv -= adjust_swap;
1243 		k_anoninfo.ani_mem_resv += adjust_swap;
1244 		mutex_enter(&freemem_lock);
1245 		availrmem -= adjust_swap;
1246 		mutex_exit(&freemem_lock);
1247 		ANI_ADD(adjust_swap);
1248 	}
1249 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
1250 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
1251 	mutex_exit(&anoninfo_lock);
1252 
1253 	ANI_ADD(-pages);
1254 
1255 	/*
1256 	 * Set the delete flag.  This prevents anyone from allocating more
1257 	 * pages from this file. Also set ST_DOINGDEL. Someone who wants to
1258 	 * add the file back while we're deleting it will signify by clearing
1259 	 * this flag.
1260 	 */
1261 	osip->si_flags |= ST_INDEL|ST_DOINGDEL;
1262 	mutex_exit(&swapinfo_lock);
1263 
1264 	/*
1265 	 * Free all the allocated physical slots for this file. We do this
1266 	 * by walking through the entire anon hash array, because we need
1267 	 * to update all the anon slots that have physical swap slots on
1268 	 * this file, and this is the only way to find them all. We go back
1269 	 * to the beginning of a bucket after each slot is freed because the
1270 	 * anonhash_lock is not held during the free and thus the hash table
1271 	 * may change under us.
1272 	 */
1273 	for (app = anon_hash; app < &anon_hash[ANON_HASH_SIZE]; app++) {
1274 		ahm = &anonhash_lock[(app-anon_hash) & (AH_LOCK_SIZE - 1)];
1275 		mutex_enter(ahm);
1276 top:
1277 		for (ap = *app; ap != NULL; ap = ap->an_hash) {
1278 			if (ap->an_pvp == cvp &&
1279 			    ap->an_poff >= osip->si_soff &&
1280 			    ap->an_poff < osip->si_eoff) {
1281 				ASSERT(TESTBIT(osip->si_swapslots,
1282 				    btop((size_t)(ap->an_poff -
1283 				    osip->si_soff))));
1284 				tvp = ap->an_vp;
1285 				toff = ap->an_off;
1286 				VN_HOLD(tvp);
1287 				mutex_exit(ahm);
1288 
1289 				error = swapslot_free(tvp, toff, osip);
1290 
1291 				VN_RELE(tvp);
1292 				mutex_enter(ahm);
1293 				if (!error && (osip->si_flags & ST_DOINGDEL)) {
1294 					goto top;
1295 				} else {
1296 					if (error) {
1297 						cmn_err(CE_WARN,
1298 						    "swapslot_free failed %d",
1299 						    error);
1300 					}
1301 
1302 					/*
1303 					 * Add device back before making it
1304 					 * visible.
1305 					 */
1306 					mutex_enter(&swapinfo_lock);
1307 					osip->si_flags &=
1308 					    ~(ST_INDEL | ST_DOINGDEL);
1309 					mutex_exit(&swapinfo_lock);
1310 
1311 					/*
1312 					 * Update the anon space available
1313 					 */
1314 					mutex_enter(&anoninfo_lock);
1315 
1316 					k_anoninfo.ani_phys_resv += adjust_swap;
1317 					k_anoninfo.ani_mem_resv -= adjust_swap;
1318 					k_anoninfo.ani_max += pages;
1319 
1320 					mutex_enter(&freemem_lock);
1321 					availrmem += adjust_swap;
1322 					mutex_exit(&freemem_lock);
1323 
1324 					mutex_exit(&anoninfo_lock);
1325 
1326 					ANI_ADD(pages);
1327 
1328 					mutex_exit(ahm);
1329 					goto out;
1330 				}
1331 			}
1332 		}
1333 		mutex_exit(ahm);
1334 	}
1335 
1336 	/* All done, they'd better all be free! */
1337 	mutex_enter(&swapinfo_lock);
1338 	ASSERT(osip->si_nfpgs == osip->si_npgs);
1339 
1340 	/* Now remove it from the swapinfo list */
1341 	for (sipp = &swapinfo; *sipp != NULL; sipp = &(*sipp)->si_next) {
1342 		if (*sipp == osip)
1343 			break;
1344 	}
1345 	ASSERT(*sipp);
1346 	*sipp = osip->si_next;
1347 	if (silast == osip)
1348 		if ((silast = osip->si_next) == NULL)
1349 			silast = swapinfo;
1350 	nswapfiles--;
1351 	mutex_exit(&swapinfo_lock);
1352 
1353 	kmem_free(osip->si_swapslots, osip->si_mapsize);
1354 	kmem_free(osip->si_pname, osip->si_pnamelen);
1355 	kmem_free(osip, sizeof (*osip));
1356 
1357 	mutex_enter(&dump_lock);
1358 	if (cvp == dumpvp)
1359 		dumpfini();
1360 	mutex_exit(&dump_lock);
1361 
1362 	/* Release the vnode */
1363 
1364 	mutex_enter(&swap_lock);
1365 	(void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED(), NULL);
1366 	mutex_enter(&cvp->v_lock);
1367 	cvp->v_flag &= ~VISSWAP;
1368 	mutex_exit(&cvp->v_lock);
1369 	VN_RELE(cvp);
1370 	mutex_exit(&swap_lock);
1371 out:
1372 	return (error);
1373 }
1374 
1375 /*
1376  * Free up a physical swap slot on swapinfo sip, currently in use by the
1377  * anonymous page whose name is (vp, off).
1378  */
1379 static int
1380 swapslot_free(
1381 	struct vnode *vp,
1382 	u_offset_t off,
1383 	struct swapinfo *sip)
1384 {
1385 	struct page *pp = NULL;
1386 	struct anon *ap = NULL;
1387 	int error = 0;
1388 	kmutex_t *ahm;
1389 	struct vnode *pvp = NULL;
1390 	u_offset_t poff;
1391 	int	alloc_pg = 0;
1392 
1393 	ASSERT(sip->si_vp != NULL);
1394 	/*
1395 	 * Get the page for the old swap slot if exists or create a new one.
1396 	 */
1397 again:
1398 	if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
1399 		pp = page_create_va(vp, off, PAGESIZE, PG_WAIT | PG_EXCL,
1400 		    segkmap, NULL);
1401 		if (pp == NULL)
1402 			goto again;
1403 		alloc_pg = 1;
1404 
1405 		error = swap_getphysname(vp, off, &pvp, &poff);
1406 		if (error || pvp != sip->si_vp || poff < sip->si_soff ||
1407 		    poff >= sip->si_eoff) {
1408 			page_io_unlock(pp);
1409 			/*LINTED: constant in conditional context*/
1410 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
1411 			return (0);
1412 		}
1413 
1414 		error = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ,
1415 		    CRED(), NULL);
1416 		if (error) {
1417 			page_io_unlock(pp);
1418 			if (error == EFAULT)
1419 				error = 0;
1420 			/*LINTED: constant in conditional context*/
1421 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
1422 			return (error);
1423 		}
1424 	}
1425 
1426 	/*
1427 	 * The anon could have been removed by anon_decref* and/or reallocated
1428 	 * by anon layer (an_pvp == NULL) with the same vp, off.
1429 	 * In this case the page which has been allocated needs to
1430 	 * be freed.
1431 	 */
1432 	if (!alloc_pg)
1433 		page_io_lock(pp);
1434 	ahm = &anonhash_lock[AH_LOCK(vp, off)];
1435 	mutex_enter(ahm);
1436 	ap = swap_anon(vp, off);
1437 	if ((ap == NULL || ap->an_pvp == NULL) && alloc_pg) {
1438 		mutex_exit(ahm);
1439 		page_io_unlock(pp);
1440 		/*LINTED: constant in conditional context*/
1441 		VN_DISPOSE(pp, B_INVAL, 0, kcred);
1442 		return (0);
1443 	}
1444 
1445 	/*
1446 	 * Free the physical slot. It may have been freed up and replaced with
1447 	 * another one while we were getting the page so we have to re-verify
1448 	 * that this is really one we want. If we do free the slot we have
1449 	 * to mark the page modified, as its backing store is now gone.
1450 	 */
1451 	if ((ap != NULL) && (ap->an_pvp == sip->si_vp && ap->an_poff >=
1452 	    sip->si_soff && ap->an_poff < sip->si_eoff)) {
1453 		swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1454 		ap->an_pvp = NULL;
1455 		ap->an_poff = 0;
1456 		mutex_exit(ahm);
1457 		hat_setmod(pp);
1458 	} else {
1459 		mutex_exit(ahm);
1460 	}
1461 	page_io_unlock(pp);
1462 	page_unlock(pp);
1463 	return (0);
1464 }
1465 
1466 
1467 /*
1468  * Get contig physical backing store for vp, in the range
1469  * [*offp, *offp + *lenp), May back a subrange of this, but must
1470  * always include the requested offset or fail. Returns the offsets
1471  * backed as [*offp, *offp + *lenp) and the physical offsets used to
1472  * back them from *pvpp in the range [*pstartp, *pstartp + *lenp).
1473  * Returns 	0 for success
1474  * 		SE_NOANON -- no anon slot for requested paged
1475  *		SE_NOSWAP -- no physical swap space available
1476  */
1477 int
1478 swap_newphysname(
1479 	struct vnode *vp,
1480 	u_offset_t offset,
1481 	u_offset_t *offp,
1482 	size_t *lenp,
1483 	struct vnode **pvpp,
1484 	u_offset_t *poffp)
1485 {
1486 	struct anon *ap = NULL;		/* anon slot for vp, off */
1487 	int error = 0;
1488 	struct vnode *pvp;
1489 	u_offset_t poff, pstart, prem;
1490 	size_t plen;
1491 	u_offset_t off, start;
1492 	kmutex_t *ahm;
1493 
1494 	ASSERT(*offp <= offset && offset < *offp + *lenp);
1495 
1496 	/* Get new physical swap slots. */
1497 	plen = *lenp;
1498 	if (!swap_phys_alloc(&pvp, &pstart, &plen, 0)) {
1499 		/*
1500 		 * No swap available so return error unless requested
1501 		 * offset is already backed in which case return that.
1502 		 */
1503 		ahm = &anonhash_lock[AH_LOCK(vp, offset)];
1504 		mutex_enter(ahm);
1505 		if ((ap = swap_anon(vp, offset)) == NULL) {
1506 			error = SE_NOANON;
1507 			mutex_exit(ahm);
1508 			return (error);
1509 		}
1510 		error = (ap->an_pvp ? 0 : SE_NOSWAP);
1511 		*offp = offset;
1512 		*lenp = PAGESIZE;
1513 		*pvpp = ap->an_pvp;
1514 		*poffp = ap->an_poff;
1515 		mutex_exit(ahm);
1516 		return (error);
1517 	}
1518 
1519 	/*
1520 	 * We got plen (<= *lenp) contig slots. Use these to back a
1521 	 * subrange of [*offp, *offp + *lenp) which includes offset.
1522 	 * For now we just put offset at the end of the kluster.
1523 	 * Clearly there are other possible choices - which is best?
1524 	 */
1525 	start = MAX(*offp,
1526 	    (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0);
1527 	ASSERT(start + plen <= *offp + *lenp);
1528 
1529 	for (off = start, poff = pstart; poff < pstart + plen;
1530 	    off += PAGESIZE, poff += PAGESIZE) {
1531 		ahm = &anonhash_lock[AH_LOCK(vp, off)];
1532 		mutex_enter(ahm);
1533 		if ((ap = swap_anon(vp, off)) != NULL) {
1534 			/* Free old slot if any, and assign new one */
1535 			if (ap->an_pvp)
1536 				swap_phys_free(ap->an_pvp, ap->an_poff,
1537 				    PAGESIZE);
1538 			ap->an_pvp = pvp;
1539 			ap->an_poff = poff;
1540 		} else {	/* No anon slot for a klustered page, quit. */
1541 			prem = (pstart + plen) - poff;
1542 			/* Already did requested page, do partial kluster */
1543 			if (off > offset) {
1544 				plen = poff - pstart;
1545 				error = 0;
1546 			/* Fail on requested page, error */
1547 			} else if (off == offset)  {
1548 				error = SE_NOANON;
1549 			/* Fail on prior page, fail on requested page, error */
1550 			} else if ((ap = swap_anon(vp, offset)) == NULL) {
1551 				error = SE_NOANON;
1552 			/* Fail on prior page, got requested page, do only it */
1553 			} else {
1554 				/* Free old slot if any, and assign new one */
1555 				if (ap->an_pvp)
1556 					swap_phys_free(ap->an_pvp, ap->an_poff,
1557 					    PAGESIZE);
1558 				ap->an_pvp = pvp;
1559 				ap->an_poff = poff;
1560 				/* One page kluster */
1561 				start = offset;
1562 				plen = PAGESIZE;
1563 				pstart = poff;
1564 				poff += PAGESIZE;
1565 				prem -= PAGESIZE;
1566 			}
1567 			/* Free unassigned slots */
1568 			swap_phys_free(pvp, poff, prem);
1569 			mutex_exit(ahm);
1570 			break;
1571 		}
1572 		mutex_exit(ahm);
1573 	}
1574 	ASSERT(*offp <= start && start + plen <= *offp + *lenp);
1575 	ASSERT(start <= offset && offset < start + plen);
1576 	*offp = start;
1577 	*lenp = plen;
1578 	*pvpp = pvp;
1579 	*poffp = pstart;
1580 	return (error);
1581 }
1582 
1583 
1584 /*
1585  * Get the physical swap backing store location for a given anonymous page
1586  * named (vp, off). The backing store name is returned in (*pvpp, *poffp).
1587  * Returns	0 		success
1588  *		EIDRM --	no anon slot (page is not allocated)
1589  */
1590 int
1591 swap_getphysname(
1592 	struct vnode *vp,
1593 	u_offset_t off,
1594 	struct vnode **pvpp,
1595 	u_offset_t *poffp)
1596 {
1597 	struct anon *ap;
1598 	int error = 0;
1599 	kmutex_t *ahm;
1600 
1601 	ahm = &anonhash_lock[AH_LOCK(vp, off)];
1602 	mutex_enter(ahm);
1603 
1604 	/* Get anon slot for vp, off */
1605 	ap = swap_anon(vp, off);
1606 	if (ap == NULL) {
1607 		error = EIDRM;
1608 		goto out;
1609 	}
1610 	*pvpp = ap->an_pvp;
1611 	*poffp = ap->an_poff;
1612 out:
1613 	mutex_exit(ahm);
1614 	return (error);
1615 }
1616