xref: /freebsd/sys/vm/swap_pager.c (revision 0c43d89a0d8e976ca494d4837f4c1f3734d2c300)
1 /*
2  * Copyright (c) 1994 John S. Dyson
3  * Copyright (c) 1990 University of Utah.
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * the Systems Programming Group of the University of Utah Computer
9  * Science Department.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
40  *
41  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
42  * $Id: swap_pager.c,v 1.6 1994/08/07 13:10:37 davidg Exp $
43  */
44 
45 /*
46  * Quick hack to page to dedicated partition(s).
47  * TODO:
48  *	Add multiprocessor locks
49  *	Deal with async writes in a better fashion
50  */
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/proc.h>
55 #include <sys/buf.h>
56 #include <sys/vnode.h>
57 #include <sys/malloc.h>
58 
59 #include <miscfs/specfs/specdev.h>
60 #include <sys/rlist.h>
61 
62 #include <vm/vm.h>
63 #include <vm/vm_pager.h>
64 #include <vm/vm_page.h>
65 #include <vm/vm_pageout.h>
66 #include <vm/swap_pager.h>
67 
68 #ifndef NPENDINGIO
69 #define NPENDINGIO	16
70 #endif
71 
72 int nswiodone;
73 extern int vm_pageout_rate_limit;
74 static int cleandone;
75 extern int hz;
76 int swap_pager_full;
77 extern vm_map_t pager_map;
78 extern int vm_swap_size;
79 
80 #define MAX_PAGEOUT_CLUSTER 8
81 
82 TAILQ_HEAD(swpclean, swpagerclean);
83 
84 typedef	struct swpagerclean	*swp_clean_t;
85 
86 struct swpagerclean {
87 	TAILQ_ENTRY(swpagerclean)	spc_list;
88 	int				spc_flags;
89 	struct buf			*spc_bp;
90 	sw_pager_t			spc_swp;
91 	vm_offset_t			spc_kva;
92 	vm_offset_t			spc_altkva;
93 	int				spc_count;
94 	vm_page_t			spc_m[MAX_PAGEOUT_CLUSTER];
95 } swcleanlist [NPENDINGIO] ;
96 
97 
98 extern vm_map_t kernel_map;
99 
100 /* spc_flags values */
101 #define SPC_ERROR	0x01
102 
103 #define SWB_EMPTY (-1)
104 
struct swpclean swap_pager_done;	/* list of completed page cleans */
106 struct swpclean swap_pager_inuse;	/* list of pending page cleans */
107 struct swpclean swap_pager_free;	/* list of free pager clean structs */
108 struct pagerlst swap_pager_list;	/* list of "named" anon regions */
109 struct pagerlst swap_pager_un_list;	/* list of "unnamed" anon pagers */
110 
111 #define	SWAP_FREE_NEEDED	0x1	/* need a swap block */
112 int swap_pager_needflags;
113 struct rlist *swapfrag;
114 
115 struct pagerlst *swp_qs[]={
116 	&swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0
117 };
118 
119 int swap_pager_putmulti();
120 
121 struct pagerops swappagerops = {
122 	swap_pager_init,
123 	swap_pager_alloc,
124 	swap_pager_dealloc,
125 	swap_pager_getpage,
126 	swap_pager_getmulti,
127 	swap_pager_putpage,
128 	swap_pager_putmulti,
129 	swap_pager_haspage
130 };
131 
132 int npendingio = NPENDINGIO;
133 int pendingiowait;
134 int require_swap_init;
135 void swap_pager_finish();
136 int dmmin, dmmax;
137 extern int vm_page_count;
138 
139 struct buf * getpbuf() ;
140 void relpbuf(struct buf *bp) ;
141 
142 static inline void swapsizecheck() {
143 	if( vm_swap_size < 128*btodb(PAGE_SIZE)) {
144 		if( swap_pager_full)
145 			printf("swap_pager: out of space\n");
146 		swap_pager_full = 1;
147 	} else if( vm_swap_size > 192*btodb(PAGE_SIZE))
148 		swap_pager_full = 0;
149 }
150 
151 void
152 swap_pager_init()
153 {
154 	dfltpagerops = &swappagerops;
155 
156 	TAILQ_INIT(&swap_pager_list);
157 	TAILQ_INIT(&swap_pager_un_list);
158 
159 	/*
160 	 * Initialize clean lists
161 	 */
162 	TAILQ_INIT(&swap_pager_inuse);
163 	TAILQ_INIT(&swap_pager_done);
164 	TAILQ_INIT(&swap_pager_free);
165 
166 	require_swap_init = 1;
167 
168 	/*
169 	 * Calculate the swap allocation constants.
170 	 */
171 
172 	dmmin = CLBYTES/DEV_BSIZE;
173 	dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2;
174 
175 }
176 
177 /*
178  * Allocate a pager structure and associated resources.
179  * Note that if we are called from the pageout daemon (handle == NULL)
 * we should not wait for memory as it could result in deadlock.
181  */
vm_pager_t
swap_pager_alloc(handle, size, prot, offset)
	caddr_t handle;
	register vm_size_t size;
	vm_prot_t prot;
	vm_offset_t offset;
{
	register vm_pager_t pager;
	register sw_pager_t swp;
	int waitok;
	int i,j;

	/*
	 * One-time lazy initialization: pre-allocate the kva and buf
	 * headers for the pageout cleaning structures so they never
	 * have to be allocated at pageout time.
	 */
	if (require_swap_init) {
		swp_clean_t spc;
		struct buf *bp;
		/*
		 * kva's are allocated here so that we don't need to keep
		 * doing kmem_alloc pageables at runtime
		 */
		for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) {
			spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE);
			if (!spc->spc_kva) {
				break;
			}
			spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT);
			if (!spc->spc_bp) {
				/* undo the kva allocation for this entry */
				kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE);
				break;
			}
			spc->spc_flags = 0;
			TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		}
		require_swap_init = 0;
		/* size == 0 is an init-only call; no pager wanted */
		if( size == 0)
			return(NULL);
	}

	/*
	 * If this is a "named" anonymous region, look it up and
	 * return the appropriate pager if it exists.
	 */
	if (handle) {
		pager = vm_pager_lookup(&swap_pager_list, handle);
		if (pager != NULL) {
			/*
			 * Use vm_object_lookup to gain a reference
			 * to the object and also to remove from the
			 * object cache.
			 */
			if (vm_object_lookup(pager) == NULL)
				panic("swap_pager_alloc: bad object");
			return(pager);
		}
	}

	/* refuse to create new pagers while swap is exhausted */
	if (swap_pager_full) {
		return(NULL);
	}

	/*
	 * Pager doesn't exist, allocate swap management resources
	 * and initialize.
	 */
	waitok = handle ? M_WAITOK : M_NOWAIT;
	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
	if (pager == NULL)
		return(NULL);
	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
	if (swp == NULL) {
		free((caddr_t)pager, M_VMPAGER);
		return(NULL);
	}
	size = round_page(size);
	swp->sw_osize = size;
	/* number of SWB_NPAGES-page block descriptors, rounded up */
	swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE);
	swp->sw_blocks = (sw_blk_t)
		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
		       M_VMPGDATA, waitok);
	if (swp->sw_blocks == NULL) {
		free((caddr_t)swp, M_VMPGDATA);
		free((caddr_t)pager, M_VMPAGER);
		return(NULL);
	}

	/* all blocks start out unallocated, unwritten, and unlocked */
	for (i = 0; i < swp->sw_nblocks; i++) {
		swp->sw_blocks[i].swb_valid = 0;
		swp->sw_blocks[i].swb_locked = 0;
		for (j = 0; j < SWB_NPAGES; j++)
			swp->sw_blocks[i].swb_block[j] = SWB_EMPTY;
	}

	swp->sw_poip = 0;
	if (handle) {
		vm_object_t object;

		swp->sw_flags = SW_NAMED;
		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
		/*
		 * Consistent with other pagers: return with object
		 * referenced.  Can't do this with handle == NULL
		 * since it might be the pageout daemon calling.
		 */
		object = vm_object_allocate(size);
		vm_object_enter(object, pager);
		vm_object_setpager(object, pager, 0, FALSE);
	} else {
		swp->sw_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list);
	}
	pager->pg_handle = handle;
	pager->pg_ops = &swappagerops;
	pager->pg_type = PG_SWAP;
	pager->pg_data = (caddr_t)swp;

	return(pager);
}
298 
299 /*
300  * returns disk block associated with pager and offset
301  * additionally, as a side effect returns a flag indicating
302  * if the block has been written
303  */
304 
305 static int *
306 swap_pager_diskaddr(swp, offset, valid)
307 	sw_pager_t swp;
308 	vm_offset_t offset;
309 	int *valid;
310 {
311 	register sw_blk_t swb;
312 	int ix;
313 
314 	if (valid)
315 		*valid = 0;
316 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
317 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
318 		return(FALSE);
319 	}
320 	swb = &swp->sw_blocks[ix];
321 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
322 	if (valid)
323 		*valid = swb->swb_valid & (1<<ix);
324 	return &swb->swb_block[ix];
325 }
326 
327 /*
328  * Utility routine to set the valid (written) bit for
329  * a block associated with a pager and offset
330  */
331 static void
332 swap_pager_setvalid(swp, offset, valid)
333 	sw_pager_t swp;
334 	vm_offset_t offset;
335 	int valid;
336 {
337 	register sw_blk_t swb;
338 	int ix;
339 
340 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
341 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks)
342 		return;
343 
344 	swb = &swp->sw_blocks[ix];
345 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
346 	if (valid)
347 		swb->swb_valid |= (1 << ix);
348 	else
349 		swb->swb_valid &= ~(1 << ix);
350 	return;
351 }
352 
353 /*
354  * this routine allocates swap space with a fragmentation
355  * minimization policy.
356  */
357 int
358 swap_pager_getswapspace( unsigned amount, unsigned *rtval) {
359 	unsigned tmpalloc;
360 	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
361 	if( amount < nblocksfrag) {
362 		if( rlist_alloc(&swapfrag, amount, rtval))
363 			return 1;
364 		if( !rlist_alloc(&swapmap, nblocksfrag, &tmpalloc))
365 			return 0;
366 		rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1);
367 		*rtval = tmpalloc;
368 		return 1;
369 	}
370 	if( !rlist_alloc(&swapmap, amount, rtval))
371 		return 0;
372 	else
373 		return 1;
374 }
375 
376 /*
377  * this routine frees swap space with a fragmentation
378  * minimization policy.
379  */
void
swap_pager_freeswapspace( unsigned from, unsigned to) {
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	unsigned tmpalloc;
	/*
	 * Return as many whole fragment-sized runs as possible directly
	 * to the main swap map.
	 */
	if( ((to + 1) - from) >= nblocksfrag) {
		while( (from + nblocksfrag) <= to + 1) {
			rlist_free(&swapmap, from, from + nblocksfrag - 1);
			from += nblocksfrag;
		}
	}
	/*
	 * NOTE(review): when from == to exactly one block remains in the
	 * range but is not freed here -- looks like this should be
	 * "from > to"; confirm against rlist semantics before changing.
	 */
	if( from >= to)
		return;
	/* the leftover partial run goes onto the fragment list ... */
	rlist_free(&swapfrag, from, to);
	/*
	 * ... and any full fragments that can now be assembled from the
	 * fragment list are coalesced back into the main map.
	 */
	while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) {
		rlist_free(&swapmap, tmpalloc, tmpalloc + nblocksfrag-1);
	}
}
397 /*
398  * this routine frees swap blocks from a specified pager
399  */
/*
 * Free the swap blocks backing byte range [start, start+size) of the
 * pager data "swp": space is returned to the swap free lists, free
 * swap accounting (vm_swap_size) is credited for blocks that held
 * valid data, and the valid bits are cleared.
 */
void
_swap_pager_freespace(swp, start, size)
	sw_pager_t swp;
	vm_offset_t start;
	vm_offset_t size;
{
	vm_offset_t i;
	int s;

	s = splbio();	/* block I/O interrupts while walking the maps */
	for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(swp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid) {
				/* only written blocks count against free swap */
				vm_swap_size += btodb(PAGE_SIZE);
				swap_pager_setvalid(swp, i, 0);
			}
			*addr = SWB_EMPTY;
		}
	}
	swapsizecheck();	/* re-evaluate the swap_pager_full state */
	splx(s);
}
425 
426 void
427 swap_pager_freespace(pager, start, size)
428 	vm_pager_t pager;
429 	vm_offset_t start;
430 	vm_offset_t size;
431 {
432 	_swap_pager_freespace((sw_pager_t) pager->pg_data, start, size);
433 }
434 
435 /*
436  * swap_pager_reclaim frees up over-allocated space from all pagers
437  * this eliminates internal fragmentation due to allocation of space
438  * for segments that are never swapped to. It has been written so that
439  * it does not block until the rlist_free operation occurs; it keeps
 * the queues consistent.
441  */
442 
443 /*
444  * Maximum number of blocks (pages) to reclaim per pass
445  */
446 #define MAXRECLAIM 256
447 
/*
 * Scan every pager on both queues for swap blocks that were allocated
 * but never written, and return them to the free list (at most
 * MAXRECLAIM blocks per call).  Only one process may be reclaiming at
 * a time; late arrivals sleep until the current pass finishes.
 */
void
swap_pager_reclaim()
{
	vm_pager_t p;
	sw_pager_t swp;
	int i, j, k;
	int s;
	int reclaimcount;
	static int reclaims[MAXRECLAIM];	/* blocks gathered this pass */
	static int in_reclaim;			/* single-threading flag */

/*
 * allow only one process to be in the swap_pager_reclaim subroutine
 */
	s = splbio();
	if (in_reclaim) {
		/* another process is reclaiming; wait for it, then return */
		tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0);
		splx(s);
		return;
	}
	in_reclaim = 1;
	reclaimcount = 0;

	/* for each pager queue */
	for (k = 0; swp_qs[k]; k++) {

		p = swp_qs[k]->tqh_first;
		while (p && (reclaimcount < MAXRECLAIM)) {

			/*
			 * see if any blocks associated with a pager has been
			 * allocated but not used (written)
			 */
			swp = (sw_pager_t) p->pg_data;
			for (i = 0; i < swp->sw_nblocks; i++) {
				sw_blk_t swb = &swp->sw_blocks[i];
				/* skip blocks with paging I/O in progress */
				if( swb->swb_locked)
					continue;
				for (j = 0; j < SWB_NPAGES; j++) {
					/* allocated but never written: reclaimable */
					if (swb->swb_block[j] != SWB_EMPTY &&
						(swb->swb_valid & (1 << j)) == 0) {
						reclaims[reclaimcount++] = swb->swb_block[j];
						swb->swb_block[j] = SWB_EMPTY;
						if (reclaimcount >= MAXRECLAIM)
							goto rfinished;
					}
				}
			}
			p = p->pg_list.tqe_next;
		}
	}

rfinished:

/*
 * free the blocks that have been added to the reclaim list
 */
	for (i = 0; i < reclaimcount; i++) {
		swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1);
		swapsizecheck();
		wakeup((caddr_t) &in_reclaim);
	}

	splx(s);
	in_reclaim = 0;
	wakeup((caddr_t) &in_reclaim);
}
515 
516 
517 /*
518  * swap_pager_copy copies blocks from one pager to another and
519  * destroys the source pager
520  */
521 
/*
 * Transfer the swap blocks backing srcpager into dstpager (dstpager
 * offset "dstoffset" corresponds to srcpager offset
 * "offset + srcoffset"), then destroy the source pager and free all
 * of its resources.  Blocks outside the transferred window are simply
 * released back to the swap free lists.
 */
void
swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset)
	vm_pager_t srcpager;
	vm_offset_t srcoffset;
	vm_pager_t dstpager;
	vm_offset_t dstoffset;
	vm_offset_t offset;
{
	sw_pager_t srcswp, dstswp;
	vm_offset_t i;
	int s;

	srcswp = (sw_pager_t) srcpager->pg_data;
	dstswp = (sw_pager_t) dstpager->pg_data;

/*
 * remove the source pager from the swap_pager internal queue
 */
	s = splbio();
	if (srcswp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list);
		srcswp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list);
	}

	/* wait for any pageouts-in-progress on the source to drain */
	while (srcswp->sw_poip) {
		tsleep((caddr_t)srcswp, PVM, "spgout", 0);
	}
	splx(s);

/*
 * clean all of the pages that are currently active and finished
 */
	(void) swap_pager_clean();

	s = splbio();
/*
 * clear source block before destination object
 * (release allocated space)
 */
	for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(srcswp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			swapsizecheck();
			*addr = SWB_EMPTY;
		}
	}
/*
 * transfer source to destination
 */
	for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) {
		int srcvalid, dstvalid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset,
			&srcvalid);
		int *dstaddrp;
	/*
	 * see if the source has space allocated
	 */
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
		/*
		 * if the source is valid and the dest has no space, then
		 * copy the allocation from the source to the dest.
		 */
			if (srcvalid) {
				dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid);
				/*
				 * if the dest already has a valid block, deallocate the
				 * source block without copying.
				 */
				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
					swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1);
					*dstaddrp = SWB_EMPTY;
				}
				/* move the block: dest adopts the source's data */
				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
					*dstaddrp = *srcaddrp;
					*srcaddrp = SWB_EMPTY;
					swap_pager_setvalid(dstswp, i + dstoffset, 1);
					vm_swap_size -= btodb(PAGE_SIZE);
				}
			}
		/*
		 * if the source is not empty at this point, then deallocate the space.
		 */
			if (*srcaddrp != SWB_EMPTY) {
				swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
				if( srcvalid)
					vm_swap_size += btodb(PAGE_SIZE);
				*srcaddrp = SWB_EMPTY;
			}
		}
	}

/*
 * deallocate the rest of the source object
 */
	for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) {
		int valid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid);
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
			swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			*srcaddrp = SWB_EMPTY;
		}
	}

	swapsizecheck();
	splx(s);

	/* release the source pager's management structures */
	free((caddr_t)srcswp->sw_blocks, M_VMPGDATA);
	srcswp->sw_blocks = 0;
	free((caddr_t)srcswp, M_VMPGDATA);
	srcpager->pg_data = 0;
	free((caddr_t)srcpager, M_VMPAGER);

	return;
}
644 
645 
/*
 * Destroy a swap pager: dequeue it, wait for in-flight pageouts to
 * finish, return all of its swap blocks to the free lists, and free
 * the management structures.
 */
void
swap_pager_dealloc(pager)
	vm_pager_t pager;
{
	register int i,j;
	register sw_blk_t bp;
	register sw_pager_t swp;
	int s;

	/*
	 * Remove from list right away so lookups will fail if we
	 * block for pageout completion.
	 */
	s = splbio();
	swp = (sw_pager_t) pager->pg_data;
	if (swp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
		swp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list);
	}
	/*
	 * Wait for all pageouts to finish and remove
	 * all entries from cleaning list.
	 */

	while (swp->sw_poip) {
		tsleep((caddr_t)swp, PVM, "swpout", 0);
	}
	splx(s);


	(void) swap_pager_clean();

	/*
	 * Free left over swap blocks
	 */
	s = splbio();
	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) {
		for (j = 0; j < SWB_NPAGES; j++)
		if (bp->swb_block[j] != SWB_EMPTY) {
			swap_pager_freeswapspace((unsigned)bp->swb_block[j],
				(unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1);
			/* credit free swap only for blocks that held data */
			if( bp->swb_valid & (1<<j))
				vm_swap_size += btodb(PAGE_SIZE);
			bp->swb_block[j] = SWB_EMPTY;
		}
	}
	splx(s);
	swapsizecheck();

	/*
	 * Free swap management resources
	 */
	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
	swp->sw_blocks = 0;
	free((caddr_t)swp, M_VMPGDATA);
	pager->pg_data = 0;
	free((caddr_t)pager, M_VMPAGER);
}
706 
707 /*
708  * swap_pager_getmulti can get multiple pages.
709  */
710 int
711 swap_pager_getmulti(pager, m, count, reqpage, sync)
712 	vm_pager_t pager;
713 	vm_page_t *m;
714 	int count;
715 	int reqpage;
716 	boolean_t sync;
717 {
718 	if( reqpage >= count)
719 		panic("swap_pager_getmulti: reqpage >= count\n");
720 	return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage);
721 }
722 
723 /*
724  * swap_pager_getpage gets individual pages
725  */
726 int
727 swap_pager_getpage(pager, m, sync)
728 	vm_pager_t pager;
729 	vm_page_t m;
730 	boolean_t sync;
731 {
732 	vm_page_t marray[1];
733 
734 	marray[0] = m;
735 	return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0);
736 }
737 
738 int
739 swap_pager_putmulti(pager, m, c, sync, rtvals)
740 	vm_pager_t pager;
741 	vm_page_t *m;
742 	int c;
743 	boolean_t sync;
744 	int *rtvals;
745 {
746 	int flags;
747 
748 	if (pager == NULL) {
749 		(void) swap_pager_clean();
750 		return VM_PAGER_OK;
751 	}
752 
753 	flags = B_WRITE;
754 	if (!sync)
755 		flags |= B_ASYNC;
756 
757 	return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals);
758 }
759 
760 /*
761  * swap_pager_putpage writes individual pages
762  */
763 int
764 swap_pager_putpage(pager, m, sync)
765 	vm_pager_t pager;
766 	vm_page_t m;
767 	boolean_t sync;
768 {
769 	int flags;
770 	vm_page_t marray[1];
771 	int rtvals[1];
772 
773 
774 	if (pager == NULL) {
775 		(void) swap_pager_clean();
776 		return VM_PAGER_OK;
777 	}
778 
779 	marray[0] = m;
780 	flags = B_WRITE;
781 	if (!sync)
782 		flags |= B_ASYNC;
783 
784 	swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals);
785 
786 	return rtvals[0];
787 }
788 
/*
 * Index into sw_blocks[] of the SWB_NPAGES-page block covering byte
 * "offset".  ("swp" is unreferenced; kept for call symmetry.)
 */
static inline int
const swap_pager_block_index(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	return (offset / (SWB_NPAGES*PAGE_SIZE));
}
796 
/*
 * Page index, within its SWB_NPAGES-page block, of byte "offset".
 * ("swp" is unreferenced; kept for call symmetry.)
 */
static inline int
const swap_pager_block_offset(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE);
}
804 
805 /*
806  * _swap_pager_haspage returns TRUE if the pager has data that has
807  * been written out.
808  */
809 static boolean_t
810 _swap_pager_haspage(swp, offset)
811 	sw_pager_t swp;
812 	vm_offset_t offset;
813 {
814 	register sw_blk_t swb;
815 	int ix;
816 
817 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
818 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
819 		return(FALSE);
820 	}
821 	swb = &swp->sw_blocks[ix];
822 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
823 	if (swb->swb_block[ix] != SWB_EMPTY) {
824 		if (swb->swb_valid & (1 << ix))
825 			return TRUE;
826 	}
827 
828 	return(FALSE);
829 }
830 
831 /*
832  * swap_pager_haspage is the externally accessible version of
833  * _swap_pager_haspage above.  this routine takes a vm_pager_t
834  * for an argument instead of sw_pager_t.
835  */
836 boolean_t
837 swap_pager_haspage(pager, offset)
838 	vm_pager_t pager;
839 	vm_offset_t offset;
840 {
841 	return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset);
842 }
843 
844 /*
 * swap_pager_freepage is a convenience routine that clears the busy
846  * bit and deallocates a page.
847  */
static void
swap_pager_freepage(m)
	vm_page_t m;
{
	/* wake any sleepers waiting on the busy page before freeing it */
	PAGE_WAKEUP(m);
	vm_page_free(m);
}
855 
856 /*
 * swap_pager_ridpages is a convenience routine that deallocates all
858  * but the required page.  this is usually used in error returns that
859  * need to invalidate the "extra" readahead pages.
860  */
861 static void
862 swap_pager_ridpages(m, count, reqpage)
863 	vm_page_t *m;
864 	int count;
865 	int reqpage;
866 {
867 	int i;
868 	for (i = 0; i < count; i++)
869 		if (i != reqpage)
870 			swap_pager_freepage(m[i]);
871 }
872 
873 int swapwritecount=0;
874 
875 /*
876  * swap_pager_iodone1 is the completion routine for both reads and async writes
877  */
void
swap_pager_iodone1(bp)
	struct buf *bp;
{
	/* mark the buffer complete and wake anyone sleeping on it */
	bp->b_flags |= B_DONE;
	bp->b_flags &= ~B_ASYNC;
	wakeup((caddr_t)bp);
/*
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
*/
}
890 
891 
/*
 * Read "count" pages described by "m" from swap; m[reqpage] is the
 * page the fault actually requires, the rest are readahead candidates
 * that are trimmed away unless their swap blocks are valid and disk-
 * contiguous with the required page.  Returns VM_PAGER_OK on success,
 * VM_PAGER_FAIL when the required page has no valid swap copy or the
 * read failed.
 */
int
swap_pager_input(swp, m, count, reqpage)
	register sw_pager_t swp;
	vm_page_t *m;
	int count, reqpage;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i;
	boolean_t rv;
	vm_offset_t kva, off[count];
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];

	int first, last;
	int failed;
	int reqdskregion;

	object = m[reqpage]->object;
	paging_offset = object->paging_offset;
	/*
	 * First determine if the page exists in the pager if this is
	 * a sync read.  This quickly handles cases where we are
	 * following shadow chains looking for the top level object
	 * with the page.
	 */
	if (swp->sw_blocks == NULL) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}

	/* translate each page's offset into a block slot + disk address */
	for(i = 0; i < count; i++) {
		vm_offset_t foff = m[i]->offset + paging_offset;
		int ix = swap_pager_block_index(swp, foff);
		if (ix >= swp->sw_nblocks) {
			int j;
			/* required page out of range: total failure */
			if( i <= reqpage) {
				swap_pager_ridpages(m, count, reqpage);
				return(VM_PAGER_FAIL);
			}
			/* trailing readahead out of range: just trim it */
			for(j = i; j < count; j++) {
				swap_pager_freepage(m[j]);
			}
			count = i;
			break;
		}

		swb[i] = &swp->sw_blocks[ix];
		off[i] = swap_pager_block_offset(swp, foff);
		reqaddr[i] = swb[i]->swb_block[off[i]];
	}

	/* make sure that our required input request is existent */

	if (reqaddr[reqpage] == SWB_EMPTY ||
		(swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}


	/* readahead must stay within the required page's disk region */
	reqdskregion = reqaddr[reqpage] / dmmax;

	/*
	 * search backwards for the first contiguous page to transfer
	 */
	failed = 0;
	first = 0;
	for (i = reqpage - 1; i >= 0; --i) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (first == 0)
					first = i + 1;
		}
	}
	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	last = count;
	for (i = reqpage + 1; i < count; i++) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (last == count)
					last = i;
		}
	}

	/* compact the surviving run down to m[0..count-1] */
	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i-first] = m[i];
			reqaddr[i-first] = reqaddr[i];
			off[i-first] = off[i];
		}
		count -= first;
		reqpage -= first;
	}

	/* keep swap_pager_reclaim away from this block during the I/O */
	++swb[reqpage]->swb_locked;

	/*
	 * at this point:
	 * "m" is a pointer to the array of vm_page_t for paging I/O
	 * "count" is the number of vm_page_t entries represented by "m"
	 * "object" is the vm_object_t for I/O
	 * "reqpage" is the index into "m" for the page actually faulted
	 */

	spc = NULL;	/* we might not use an spc data structure */

	if (count == 1) {
		/*
		 * if a kva has not been allocated, we can only do a one page transfer,
		 * so we free the other pages that might have been allocated by
		 * vm_fault.
		 */
		swap_pager_ridpages(m, count, reqpage);
		m[0] = m[reqpage];
		reqaddr[0] = reqaddr[reqpage];

		count = 1;
		reqpage = 0;
	/*
	 * get a swap pager clean data structure, block until we get it
	 */
		if (swap_pager_free.tqh_first == NULL) {
			s = splbio();
			/* the pageout daemon reaps its own completions */
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
			while (swap_pager_free.tqh_first == NULL) {
				swap_pager_needflags |= SWAP_FREE_NEEDED;
				tsleep((caddr_t)&swap_pager_free,
					PVM, "swpfre", 0);
				if( curproc == pageproc)
					(void) swap_pager_clean();
				else
					wakeup((caddr_t) &vm_pages_needed);
			}
			splx(s);
		}
		spc = swap_pager_free.tqh_first;
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
		kva = spc->spc_kva;
		bp = spc->spc_bp;
		bzero(bp, sizeof *bp);
		bp->b_spc = spc;
	} else {
	/*
	 * Get a swap buffer header to perform the IO
	 */
		bp = getpbuf();
		kva = (vm_offset_t) bp->b_data;
	}

	/*
	 * map our page(s) into kva for input
	 */
	pmap_qenter( kva, m, count);

	s = splbio();
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = swap_pager_iodone1;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;

	bgetvp( swapdev_vp, bp);

	/* account one more pagein-in-progress on this pager */
	swp->sw_piip++;

	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swread", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_piip;
	if (swp->sw_piip == 0)
		wakeup((caddr_t) swp);

	/*
	 * relpbuf does this, but we maintain our own buffer
	 * list also...
	 */
	if (bp->b_vp)
		brelvp(bp);

	splx(s);
	--swb[reqpage]->swb_locked;

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	if (spc) {
		/*
		 * if we have used an spc, we need to free it.
		 */
		if( bp->b_rcred != NOCRED)
			crfree(bp->b_rcred);
		if( bp->b_wcred != NOCRED)
			crfree(bp->b_wcred);
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
	} else {
		/*
		 * release the physical I/O buffer
		 */
		relpbuf(bp);
		/*
		 * finish up input if everything is ok
		 */
		if( rv == VM_PAGER_OK) {
			for (i = 0; i < count; i++) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				if (i != reqpage) {
					/*
					 * whether or not to leave the page activated
					 * is up in the air, but we should put the page
					 * on a page queue somewhere. (it already is in
					 * the object).
					 * After some empirical results, it is best
					 * to deactivate the readahead pages.
					 */
					vm_page_deactivate(m[i]);

					/*
					 * just in case someone was asking for this
					 * page we now tell them that it is ok to use
					 */
					m[i]->flags &= ~PG_FAKE;
					PAGE_WAKEUP(m[i]);
				}
			}
			/* when swap is scarce, give back the blocks just read */
			if( swap_pager_full) {
				_swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE);
			}
		} else {
			swap_pager_ridpages(m, count, reqpage);
		}
	}
	return(rv);
}
1168 
/*
 * swap_pager_output:
 *	Write "count" pages (array m) belonging to the swap pager "swp"
 *	out to swap space.  Per-page status is returned through rtvals[]
 *	(VM_PAGER_OK, VM_PAGER_FAIL, VM_PAGER_AGAIN, or VM_PAGER_PEND).
 *	If B_ASYNC is set in "flags", the write is started and
 *	VM_PAGER_PEND is returned immediately; otherwise this routine
 *	sleeps until the I/O completes and returns the final status.
 */
int
swap_pager_output(swp, m, count, flags, rtvals)
	register sw_pager_t swp;
	vm_page_t *m;
	int count;
	int flags;
	int *rtvals;
{
	register struct buf *bp;
	sw_blk_t swb[count];	/* swap block descriptor for each page */
	register int s;
	int i, j, ix;
	boolean_t rv;	/* NOTE(review): actually holds VM_PAGER_* codes */
	vm_offset_t kva, off, foff;
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];	/* swap device block number for each page */
	int failed;

/*
	if( count > 1)
		printf("off: 0x%x, count: %d\n", m[0]->offset, count);
*/
	spc = NULL;

	object = m[0]->object;
	paging_offset = object->paging_offset;

	/*
	 * Locate (and, if necessary, allocate) the swap block backing
	 * each page.  Once one page fails, all subsequent pages are
	 * failed as well, since only a leading run can be written.
	 */
	failed = 0;
	for(j=0;j<count;j++) {
		foff = m[j]->offset + paging_offset;
		ix = swap_pager_block_index(swp, foff);
		swb[j] = 0;
		if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
			rtvals[j] = VM_PAGER_FAIL;
			failed = 1;
			continue;
		} else {
			rtvals[j] = VM_PAGER_OK;
		}
		/* lock the swap block against reclaim while we work on it */
		swb[j] = &swp->sw_blocks[ix];
		++swb[j]->swb_locked;
		if( failed) {
			rtvals[j] = VM_PAGER_FAIL;
			continue;
		}
		off = swap_pager_block_offset(swp, foff);
		reqaddr[j] = swb[j]->swb_block[off];
		if( reqaddr[j] == SWB_EMPTY) {
			/* no swap space assigned to this page yet: allocate some */
			int blk;
			int tries;
			int ntoget;
			tries = 0;
			s = splbio();

			/*
			 * if any other pages have been allocated in this block, we
			 * only try to get one page.
			 */
			for (i = 0; i < SWB_NPAGES; i++) {
				if (swb[j]->swb_block[i] != SWB_EMPTY)
					break;
			}


			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
			/*
			 * this code is a little conservative, but works
			 * (the intent of this code is to allocate small chunks
			 *  for small objects)
			 */
			if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) {
				ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE;
			}

retrygetspace:
			if (!swap_pager_full && ntoget > 1 &&
				swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) {

				/* got a contiguous chunk; carve it into page slots */
				for (i = 0; i < ntoget; i++) {
					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
					swb[j]->swb_valid = 0;
				}

				reqaddr[j] = swb[j]->swb_block[off];
			} else if (!swap_pager_getswapspace(btodb(PAGE_SIZE),
				&swb[j]->swb_block[off])) {
				/*
				 * if the allocation has failed, we try to reclaim space and
				 * retry.
				 */
				if (++tries == 1) {
					swap_pager_reclaim();
					goto retrygetspace;
				}
				rtvals[j] = VM_PAGER_AGAIN;
				failed = 1;
			} else {
				reqaddr[j] = swb[j]->swb_block[off];
				swb[j]->swb_valid &= ~(1<<off);
			}
			splx(s);
		}
	}

	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	for (i = 0; i < count; i++) {
		/*
		 * the run ends at the first page whose swap block is not
		 * physically contiguous with reqaddr[0], crosses a dmmax
		 * boundary, or failed above; the rest get VM_PAGER_AGAIN
		 * so the caller retries them later.
		 */
		if( failed || (reqaddr[i] != reqaddr[0] + i*btodb(PAGE_SIZE)) ||
			(reqaddr[i] / dmmax) != (reqaddr[0] / dmmax) ||
			(rtvals[i] != VM_PAGER_OK)) {
			failed = 1;
			if( rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
	}

	/* unlock the swap blocks of the pages we are not going to write */
	for(i = 0; i < count; i++) {
		if( rtvals[i] != VM_PAGER_OK) {
			if( swb[i])
				--swb[i]->swb_locked;
		}
	}

	/* find the length of the leading run of writable pages */
	for(i = 0; i < count; i++)
		if( rtvals[i] != VM_PAGER_OK)
			break;

	if( i == 0) {
		return VM_PAGER_AGAIN;
	}

	/* only the leading run of OK pages is written below */
	count = i;
	for(i=0;i<count;i++) {
		if( reqaddr[i] == SWB_EMPTY)
			printf("I/O to empty block????\n");
	}

	/*
	 */

	/*
	 * For synchronous writes, we clean up
	 * all completed async pageouts.
	 */
	if ((flags & B_ASYNC) == 0) {
		swap_pager_clean();
	}

	kva = 0;

	/*
	 * we allocate a new kva for transfers > 1 page
	 * but for transfers == 1 page, the swap_pager_free list contains
	 * entries that have pre-allocated kva's (for efficiency).
	 * NOTE -- we do not use the physical buffer pool or the
	 * preallocated associated kva's because of the potential for
	 * deadlock.  This is very subtle -- but deadlocks or resource
	 * contention must be avoided on pageouts -- or your system will
	 * sleep (forever) !!!
	 */
	if ( count > 1) {
		kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE);
		if( !kva) {
			/* no kva available: undo the block locks and retry later */
			for (i = 0; i < count; i++) {
				if( swb[i])
					--swb[i]->swb_locked;
				rtvals[i] = VM_PAGER_AGAIN;
			}
			return VM_PAGER_AGAIN;
		}
	}

	/*
	 * get a swap pager clean data structure, block until we get it
	 */
	if (swap_pager_free.tqh_first == NULL) {
		s = splbio();
		/* the pageout daemon must clean for itself; others wake it */
		if( curproc == pageproc)
			(void) swap_pager_clean();
		else
			wakeup((caddr_t) &vm_pages_needed);
		while (swap_pager_free.tqh_first == NULL) {
			swap_pager_needflags |= SWAP_FREE_NEEDED;
			tsleep((caddr_t)&swap_pager_free,
				PVM, "swpfre", 0);
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
		}
		splx(s);
	}

	spc = swap_pager_free.tqh_first;
	TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
	if( !kva) {
		/* single page: use the spc's preallocated kva */
		kva = spc->spc_kva;
		spc->spc_altkva = 0;
	} else {
		/* multi-page: remember the extra kva so it can be freed later */
		spc->spc_altkva = kva;
	}

	/*
	 * map our page(s) into kva for I/O
	 */
	pmap_qenter(kva, m, count);

	/*
	 * get the base I/O offset into the swap file
	 */
	for(i=0;i<count;i++) {
		foff = m[i]->offset + paging_offset;
		off = swap_pager_block_offset(swp, foff);
		/*
		 * if we are setting the valid bit anew,
		 * then diminish the swap free space
		 */
		if( (swb[i]->swb_valid & (1 << off)) == 0)
			vm_swap_size -= btodb(PAGE_SIZE);

		/*
		 * set the valid bit
		 */
		swb[i]->swb_valid |= (1 << off);
		/*
		 * and unlock the data structure
		 */
		--swb[i]->swb_locked;
	}

	s = splbio();
	/*
	 * Get a swap buffer header and perform the IO
	 */
	bp = spc->spc_bp;
	bzero(bp, sizeof *bp);
	bp->b_spc = spc;

	bp->b_flags = B_BUSY;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if( bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_data = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bgetvp( swapdev_vp, bp);

	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;
	swapdev_vp->v_numoutput++;

	/*
	 * If this is an async write we set up additional buffer fields
	 * and place a "cleaning" entry on the inuse queue.
	 */
	if ( flags & B_ASYNC ) {
		spc->spc_flags = 0;
		spc->spc_swp = swp;
		for(i=0;i<count;i++)
			spc->spc_m[i] = m[i];
		spc->spc_count = count;
		/*
		 * the completion routine for async writes
		 */
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;
		swp->sw_poip++;
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
	} else {
		swp->sw_poip++;
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone1;
	}
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);
	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) {
		/* async: report pending; swap_pager_clean() finishes up later */
		if ((bp->b_flags & B_DONE) == B_DONE) {
			swap_pager_clean();
		}
		splx(s);
		for(i=0;i<count;i++) {
			rtvals[i] = VM_PAGER_PEND;
		}
		return VM_PAGER_PEND;
	}

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swwrt", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	/* one fewer pageout in progress; wake anyone waiting for drain */
	--swp->sw_poip;
	if (swp->sw_poip == 0)
		wakeup((caddr_t) swp);

	if (bp->b_vp)
		brelvp(bp);

	splx(s);

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	/*
	 * if we have written the page, then indicate that the page
	 * is clean.
	 */
	if (rv == VM_PAGER_OK) {
		for(i=0;i<count;i++) {
			if( rtvals[i] == VM_PAGER_OK) {
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				/*
				 * optimization, if a page has been read during the
				 * pageout process, we activate it.
				 */
				if ( (m[i]->flags & PG_ACTIVE) == 0 &&
					pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))
					vm_page_activate(m[i]);
			}
		}
	} else {
		/* write failed: leave the pages dirty so they are retried */
		for(i=0;i<count;i++) {
			rtvals[i] = rv;
			m[i]->flags |= PG_LAUNDRY;
		}
	}

	/* release the multi-page kva if one was allocated */
	if( spc->spc_altkva)
		kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE);

	/* return the spc to the free list and wake any waiters */
	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
	if (swap_pager_needflags & SWAP_FREE_NEEDED) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
	}

	return(rv);
}
1529 
/*
 * swap_pager_clean:
 *	Reap completed async pageouts from the "done" queue: unmap their
 *	kernel virtual mappings, finish the per-page bookkeeping via
 *	swap_pager_finish(), and return each cleaning structure to the
 *	free list (waking any waiters).
 *	NOTE(review): tspc is never assigned non-NULL in the visible
 *	code, so the return value is always FALSE here; it appears to be
 *	a vestige of an earlier interface -- confirm against callers.
 */
boolean_t
swap_pager_clean()
{
	register swp_clean_t spc, tspc;
	register int s;

	tspc = NULL;
	/* fast path: nothing has completed */
	if (swap_pager_done.tqh_first == NULL)
		return FALSE;
	for (;;) {
		s = splbio();
		/*
		 * Look up and removal from done list must be done
		 * at splbio() to avoid conflicts with swap_pager_iodone.
		 */
		while (spc = swap_pager_done.tqh_first) {
			if( spc->spc_altkva) {
				/* multi-page transfer used a separately allocated kva */
				pmap_qremove( spc->spc_altkva, spc->spc_count);
				kmem_free_wakeup(pager_map, spc->spc_altkva, spc->spc_count * PAGE_SIZE);
				spc->spc_altkva = 0;
			} else {
				pmap_qremove( spc->spc_kva, 1);
			}
			swap_pager_finish(spc);
			TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
			goto doclean;
		}

		/*
		 * No operations done, that's all we can do for now.
		 */

		splx(s);
		break;

		/*
		 * The desired page was found to be busy earlier in
		 * the scan but has since completed.
		 */
doclean:
		if (tspc && tspc == spc) {
			tspc = NULL;
		}
		/* recycle the cleaning structure and wake any waiters */
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
		++cleandone;
		splx(s);
	}

	return(tspc ? TRUE : FALSE);
}
1585 
1586 void
1587 swap_pager_finish(spc)
1588 	register swp_clean_t spc;
1589 {
1590 	vm_object_t object = spc->spc_m[0]->object;
1591 	int i;
1592 
1593 	if ((object->paging_in_progress -= spc->spc_count) == 0)
1594 		thread_wakeup((int) object);
1595 
1596 	/*
1597 	 * If no error mark as clean and inform the pmap system.
1598 	 * If error, mark as dirty so we will try again.
1599 	 * (XXX could get stuck doing this, should give up after awhile)
1600 	 */
1601 	if (spc->spc_flags & SPC_ERROR) {
1602 		for(i=0;i<spc->spc_count;i++) {
1603 			printf("swap_pager_finish: clean of page %x failed\n",
1604 			       VM_PAGE_TO_PHYS(spc->spc_m[i]));
1605 			spc->spc_m[i]->flags |= PG_LAUNDRY;
1606 		}
1607 	} else {
1608 		for(i=0;i<spc->spc_count;i++) {
1609 			pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i]));
1610 			spc->spc_m[i]->flags |= PG_CLEAN;
1611 		}
1612 	}
1613 
1614 
1615 	for(i=0;i<spc->spc_count;i++) {
1616 		/*
1617 		 * we wakeup any processes that are waiting on
1618 		 * these pages.
1619 		 */
1620 		PAGE_WAKEUP(spc->spc_m[i]);
1621 	}
1622 	nswiodone -= spc->spc_count;
1623 
1624 	return;
1625 }
1626 
1627 /*
1628  * swap_pager_iodone
1629  */
/*
 * swap_pager_iodone:
 *	Completion handler for async swap pageouts, invoked via B_CALL
 *	from the block I/O system.  Moves the cleaning structure from
 *	the inuse queue to the done queue (for swap_pager_clean() to
 *	finish at normal priority), releases the buffer's vnode and
 *	credentials, and wakes the various parties that may be waiting
 *	on pageout completion.
 */
void
swap_pager_iodone(bp)
	register struct buf *bp;
{
	register swp_clean_t spc;
	int s;

	s = splbio();
	spc = (swp_clean_t) bp->b_spc;
	/* hand off to the done queue for later processing */
	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
	if (bp->b_flags & B_ERROR) {
		/* remember the failure; swap_pager_finish re-dirties the pages */
		spc->spc_flags |= SPC_ERROR;
		printf("error %d blkno %d sz %d ",
			bp->b_error, bp->b_blkno, bp->b_bcount);
	}

/*
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
*/

	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC);
	if (bp->b_vp) {
		brelvp(bp);
	}
	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);

	nswiodone += spc->spc_count;
	/* last outstanding pageout for this pager: wake anyone draining it */
	if (--spc->spc_swp->sw_poip == 0) {
		wakeup((caddr_t)spc->spc_swp);
	}

	/* wake processes waiting for a free cleaning structure */
	if ((swap_pager_needflags & SWAP_FREE_NEEDED) ||
	    swap_pager_inuse.tqh_first == 0) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
		wakeup((caddr_t)&vm_pages_needed);
	}

	if (vm_pageout_pages_needed) {
		wakeup((caddr_t)&vm_pageout_pages_needed);
	}

	/*
	 * nudge the pageout daemon when all pageouts have drained, or
	 * when the just-completed I/O would bring free memory back over
	 * the minimum threshold.
	 */
	if ((swap_pager_inuse.tqh_first == NULL) ||
	    (cnt.v_free_count < cnt.v_free_min &&
	    nswiodone + cnt.v_free_count >= cnt.v_free_min) ) {
		wakeup((caddr_t)&vm_pages_needed);
	}
	splx(s);
}
1684 
1685 /*
1686  * return true if any swap control structures can be allocated
1687  */
1688 int
1689 swap_pager_ready() {
1690 	if( swap_pager_free.tqh_first)
1691 		return 1;
1692 	else
1693 		return 0;
1694 }
1695