xref: /freebsd/sys/vm/swap_pager.c (revision 976e77fc9c178ec1373d3e867d3d35d3005c64cd)
1 /*
2  * Copyright (c) 1994 John S. Dyson
3  * Copyright (c) 1990 University of Utah.
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * the Systems Programming Group of the University of Utah Computer
9  * Science Department.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
40  *
41  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
42  * $Id: swap_pager.c,v 1.13 1994/10/14 12:26:17 davidg Exp $
43  */
44 
45 /*
46  * Quick hack to page to dedicated partition(s).
47  * TODO:
48  *	Add multiprocessor locks
49  *	Deal with async writes in a better fashion
50  */
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/proc.h>
55 #include <sys/buf.h>
56 #include <sys/vnode.h>
57 #include <sys/malloc.h>
58 
59 #include <miscfs/specfs/specdev.h>
60 #include <sys/rlist.h>
61 
62 #include <vm/vm.h>
63 #include <vm/vm_pager.h>
64 #include <vm/vm_page.h>
65 #include <vm/vm_pageout.h>
66 #include <vm/swap_pager.h>
67 
68 #ifndef NPENDINGIO
69 #define NPENDINGIO	16
70 #endif
71 
/* internal read/write entry points, defined later in this file */
int	swap_pager_input __P((sw_pager_t, vm_page_t *, int, int));
int	swap_pager_output __P((sw_pager_t, vm_page_t *, int, int, int *));

int nswiodone;				/* count of completed swap I/Os */
extern int vm_pageout_rate_limit;
static int cleandone;
extern int hz;
int swap_pager_full;			/* non-zero when swap space is nearly exhausted */
extern vm_map_t pager_map;
extern int vm_swap_size;		/* free swap space, in DEV_BSIZE blocks */
struct rlist *swaplist;			/* resource list of free swap blocks */
int nswaplist;
84 
/* maximum pages gathered into a single clustered pageout */
#define MAX_PAGEOUT_CLUSTER 8

TAILQ_HEAD(swpclean, swpagerclean);

typedef	struct swpagerclean	*swp_clean_t;

/*
 * Bookkeeping record for one (possibly clustered) pageout operation:
 * the kva the pages are mapped at, a preallocated buffer header, the
 * owning pager, and the pages in flight.  A fixed pool of these lives
 * in swcleanlist[] and cycles through the free/inuse/done queues.
 */
struct swpagerclean {
	TAILQ_ENTRY(swpagerclean)	spc_list;	/* free/inuse/done queue linkage */
	int				spc_flags;	/* SPC_ERROR */
	struct buf			*spc_bp;	/* preallocated buffer header */
	sw_pager_t			spc_swp;	/* pager this cleaning is for */
	vm_offset_t			spc_kva;	/* kva region for page mappings */
	int				spc_count;	/* number of pages in spc_m[] */
	vm_page_t			spc_m[MAX_PAGEOUT_CLUSTER];
} swcleanlist [NPENDINGIO] ;
100 
101 
extern vm_map_t kernel_map;

/* spc_flags values */
#define SPC_ERROR	0x01	/* pageout I/O failed */

#define SWB_EMPTY (-1)		/* swb_block[] slot holds no swap block */

struct swpclean swap_pager_done;	/* list of completed page cleans */
struct swpclean swap_pager_inuse;	/* list of pending page cleans */
struct swpclean swap_pager_free;	/* list of free pager clean structs */
struct pagerlst swap_pager_list;	/* list of "named" anon regions */
struct pagerlst swap_pager_un_list;	/* list of "unnamed" anon pagers */

#define	SWAP_FREE_NEEDED	0x1	/* need a swap block */
int swap_pager_needflags;		/* sleepers waiting on swap_pager_free */
struct rlist *swapfrag;

/* pager queues scanned by swap_pager_reclaim(); NULL-terminated */
struct pagerlst *swp_qs[]={
	&swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0
};
122 
int swap_pager_putmulti();

/* operations vector exported to the generic VM pager layer */
struct pagerops swappagerops = {
	swap_pager_init,
	swap_pager_alloc,
	swap_pager_dealloc,
	swap_pager_getpage,
	swap_pager_getmulti,
	swap_pager_putpage,
	swap_pager_putmulti,
	swap_pager_haspage
};

int npendingio = NPENDINGIO;	/* number of spc records in swcleanlist[] */
int pendingiowait;
int require_swap_init;		/* lazy-init flag consumed by swap_pager_alloc() */
void swap_pager_finish();
int dmmin, dmmax;		/* swap allocation constants (set in swap_pager_init) */
extern int vm_page_count;
142 
/*
 * Track global swap availability with hysteresis: latch swap_pager_full
 * when free swap drops below 128 pages worth of blocks, clear it again
 * only once free swap rises above 192 pages worth.
 * NOTE(review): the message prints only when swap_pager_full is already
 * set, i.e. on the second and later low-space checks, never on the
 * transition itself -- confirm whether printing once on the transition
 * was the intent.
 */
static inline void swapsizecheck() {
	if( vm_swap_size < 128*btodb(PAGE_SIZE)) {
		if( swap_pager_full)
			printf("swap_pager: out of space\n");
		swap_pager_full = 1;
	} else if( vm_swap_size > 192*btodb(PAGE_SIZE))
		swap_pager_full = 0;
}
151 
152 void
153 swap_pager_init()
154 {
155 	dfltpagerops = &swappagerops;
156 
157 	TAILQ_INIT(&swap_pager_list);
158 	TAILQ_INIT(&swap_pager_un_list);
159 
160 	/*
161 	 * Initialize clean lists
162 	 */
163 	TAILQ_INIT(&swap_pager_inuse);
164 	TAILQ_INIT(&swap_pager_done);
165 	TAILQ_INIT(&swap_pager_free);
166 
167 	require_swap_init = 1;
168 
169 	/*
170 	 * Calculate the swap allocation constants.
171 	 */
172 
173 	dmmin = CLBYTES/DEV_BSIZE;
174 	dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2;
175 
176 }
177 
178 /*
179  * Allocate a pager structure and associated resources.
180  * Note that if we are called from the pageout daemon (handle == NULL)
181  * we should not wait for memory as it could resulting in deadlock.
182  */
183 vm_pager_t
184 swap_pager_alloc(handle, size, prot, offset)
185 	caddr_t handle;
186 	register vm_size_t size;
187 	vm_prot_t prot;
188 	vm_offset_t offset;
189 {
190 	register vm_pager_t pager;
191 	register sw_pager_t swp;
192 	int waitok;
193 	int i,j;
194 
195 	if (require_swap_init) {
196 		swp_clean_t spc;
197 		struct buf *bp;
198 		/*
199 		 * kva's are allocated here so that we dont need to keep
200 		 * doing kmem_alloc pageables at runtime
201 		 */
202 		for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) {
203 			spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE*MAX_PAGEOUT_CLUSTER);
204 			if (!spc->spc_kva) {
205 				break;
206 			}
207 			spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT);
208 			if (!spc->spc_bp) {
209 				kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE);
210 				break;
211 			}
212 			spc->spc_flags = 0;
213 			TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
214 		}
215 		require_swap_init = 0;
216 		if( size == 0)
217 			return(NULL);
218 	}
219 
220 	/*
221 	 * If this is a "named" anonymous region, look it up and
222 	 * return the appropriate pager if it exists.
223 	 */
224 	if (handle) {
225 		pager = vm_pager_lookup(&swap_pager_list, handle);
226 		if (pager != NULL) {
227 			/*
228 			 * Use vm_object_lookup to gain a reference
229 			 * to the object and also to remove from the
230 			 * object cache.
231 			 */
232 			if (vm_object_lookup(pager) == NULL)
233 				panic("swap_pager_alloc: bad object");
234 			return(pager);
235 		}
236 	}
237 
238 	if (swap_pager_full) {
239 		return(NULL);
240 	}
241 
242 	/*
243 	 * Pager doesn't exist, allocate swap management resources
244 	 * and initialize.
245 	 */
246 	waitok = handle ? M_WAITOK : M_NOWAIT;
247 	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
248 	if (pager == NULL)
249 		return(NULL);
250 	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
251 	if (swp == NULL) {
252 		free((caddr_t)pager, M_VMPAGER);
253 		return(NULL);
254 	}
255 	size = round_page(size);
256 	swp->sw_osize = size;
257 	swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE);
258 	swp->sw_blocks = (sw_blk_t)
259 		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
260 		       M_VMPGDATA, waitok);
261 	if (swp->sw_blocks == NULL) {
262 		free((caddr_t)swp, M_VMPGDATA);
263 		free((caddr_t)pager, M_VMPAGER);
264 		return(NULL);
265 	}
266 
267 	for (i = 0; i < swp->sw_nblocks; i++) {
268 		swp->sw_blocks[i].swb_valid = 0;
269 		swp->sw_blocks[i].swb_locked = 0;
270 		for (j = 0; j < SWB_NPAGES; j++)
271 			swp->sw_blocks[i].swb_block[j] = SWB_EMPTY;
272 	}
273 
274 	swp->sw_poip = 0;
275 	if (handle) {
276 		vm_object_t object;
277 
278 		swp->sw_flags = SW_NAMED;
279 		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
280 		/*
281 		 * Consistant with other pagers: return with object
282 		 * referenced.  Can't do this with handle == NULL
283 		 * since it might be the pageout daemon calling.
284 		 */
285 		object = vm_object_allocate(size);
286 		vm_object_enter(object, pager);
287 		vm_object_setpager(object, pager, 0, FALSE);
288 	} else {
289 		swp->sw_flags = 0;
290 		TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list);
291 	}
292 	pager->pg_handle = handle;
293 	pager->pg_ops = &swappagerops;
294 	pager->pg_type = PG_SWAP;
295 	pager->pg_data = (caddr_t)swp;
296 
297 	return(pager);
298 }
299 
300 /*
301  * returns disk block associated with pager and offset
302  * additionally, as a side effect returns a flag indicating
303  * if the block has been written
304  */
305 
306 static int *
307 swap_pager_diskaddr(swp, offset, valid)
308 	sw_pager_t swp;
309 	vm_offset_t offset;
310 	int *valid;
311 {
312 	register sw_blk_t swb;
313 	int ix;
314 
315 	if (valid)
316 		*valid = 0;
317 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
318 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
319 		return(FALSE);
320 	}
321 	swb = &swp->sw_blocks[ix];
322 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
323 	if (valid)
324 		*valid = swb->swb_valid & (1<<ix);
325 	return &swb->swb_block[ix];
326 }
327 
328 /*
329  * Utility routine to set the valid (written) bit for
330  * a block associated with a pager and offset
331  */
332 static void
333 swap_pager_setvalid(swp, offset, valid)
334 	sw_pager_t swp;
335 	vm_offset_t offset;
336 	int valid;
337 {
338 	register sw_blk_t swb;
339 	int ix;
340 
341 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
342 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks)
343 		return;
344 
345 	swb = &swp->sw_blocks[ix];
346 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
347 	if (valid)
348 		swb->swb_valid |= (1 << ix);
349 	else
350 		swb->swb_valid &= ~(1 << ix);
351 	return;
352 }
353 
/*
 * this routine allocates swap space with a fragmentation
 * minimization policy.
 *
 * Allocates "amount" contiguous DEV_BSIZE blocks from the free swap
 * resource list; on success stores the starting block in *rtval and
 * returns 1, otherwise returns 0.  The EXP code (normally compiled
 * out) services sub-cluster requests from a separate fragment list,
 * refilling it one cluster at a time from the main list.
 */
int
swap_pager_getswapspace( unsigned amount, unsigned *rtval) {
#ifdef EXP
	unsigned tmpalloc;
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	if( amount < nblocksfrag) {
		/* try the fragment list first for small requests */
		if( rlist_alloc(&swapfrag, amount, rtval))
			return 1;
		/* refill: carve one full cluster from the main list */
		if( !rlist_alloc(&swaplist, nblocksfrag, &tmpalloc))
			return 0;
		/* keep the tail of the cluster as future fragments */
		rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1);
		*rtval = tmpalloc;
		return 1;
	}
#endif
	if( !rlist_alloc(&swaplist, amount, rtval))
		return 0;
	else
		return 1;
}
378 
/*
 * this routine frees swap space with a fragmentation
 * minimization policy.
 *
 * Returns blocks [from, to] (inclusive) to the free swap resource
 * list.  Note the brace structure spans the #ifdef EXP boundaries:
 * with EXP defined, cluster-sized-or-larger ranges go back to the
 * main list, smaller ranges go to the fragment list, and any whole
 * clusters that coalesce there are promoted back to the main list.
 */
void
swap_pager_freeswapspace( unsigned from, unsigned to) {
#ifdef EXP
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	unsigned tmpalloc;
	if( ((to + 1) - from) >= nblocksfrag) {
#endif
		rlist_free(&swaplist, from, to);
#ifdef EXP
		return;
	}
	rlist_free(&swapfrag, from, to);
	/* promote any complete clusters back to the main free list */
	while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) {
		rlist_free(&swaplist, tmpalloc, tmpalloc + nblocksfrag-1);
	}
#endif
}
400 /*
401  * this routine frees swap blocks from a specified pager
402  */
403 void
404 _swap_pager_freespace(swp, start, size)
405 	sw_pager_t swp;
406 	vm_offset_t start;
407 	vm_offset_t size;
408 {
409 	vm_offset_t i;
410 	int s;
411 
412 	s = splbio();
413 	for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) {
414 		int valid;
415 		int *addr = swap_pager_diskaddr(swp, i, &valid);
416 		if (addr && *addr != SWB_EMPTY) {
417 			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
418 			if( valid) {
419 				vm_swap_size += btodb(PAGE_SIZE);
420 				swap_pager_setvalid(swp, i, 0);
421 			}
422 			*addr = SWB_EMPTY;
423 		}
424 	}
425 	swapsizecheck();
426 	splx(s);
427 }
428 
429 void
430 swap_pager_freespace(pager, start, size)
431 	vm_pager_t pager;
432 	vm_offset_t start;
433 	vm_offset_t size;
434 {
435 	_swap_pager_freespace((sw_pager_t) pager->pg_data, start, size);
436 }
437 
/*
 * swap_pager_reclaim frees up over-allocated space from all pagers
 * this eliminates internal fragmentation due to allocation of space
 * for segments that are never swapped to. It has been written so that
 * it does not block until the rlist_free operation occurs; it keeps
 * the queues consistent.
 */

/*
 * Maximum number of blocks (pages) to reclaim per pass
 */
#define MAXRECLAIM 256

void
swap_pager_reclaim()
{
	vm_pager_t p;
	sw_pager_t swp;
	int i, j, k;
	int s;
	int reclaimcount;
	static int reclaims[MAXRECLAIM];	/* blocks gathered this pass */
	static int in_reclaim;			/* single-threading flag */

/*
 * allow only one process to be in the swap_pager_reclaim subroutine
 */
	s = splbio();
	if (in_reclaim) {
		/* someone else is reclaiming; wait for them and return */
		tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0);
		splx(s);
		return;
	}
	in_reclaim = 1;
	reclaimcount = 0;

	/* for each pager queue */
	for (k = 0; swp_qs[k]; k++) {

		p = swp_qs[k]->tqh_first;
		while (p && (reclaimcount < MAXRECLAIM)) {

			/*
			 * see if any blocks associated with a pager has been
			 * allocated but not used (written)
			 */
			swp = (sw_pager_t) p->pg_data;
			for (i = 0; i < swp->sw_nblocks; i++) {
				sw_blk_t swb = &swp->sw_blocks[i];
				/* skip block groups with I/O in progress */
				if( swb->swb_locked)
					continue;
				for (j = 0; j < SWB_NPAGES; j++) {
					/* allocated but never written: reclaimable */
					if (swb->swb_block[j] != SWB_EMPTY &&
						(swb->swb_valid & (1 << j)) == 0) {
						reclaims[reclaimcount++] = swb->swb_block[j];
						swb->swb_block[j] = SWB_EMPTY;
						if (reclaimcount >= MAXRECLAIM)
							goto rfinished;
					}
				}
			}
			p = p->pg_list.tqe_next;
		}
	}

rfinished:

/*
 * free the blocks that have been added to the reclaim list
 */
	for (i = 0; i < reclaimcount; i++) {
		swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1);
		swapsizecheck();
		/* NOTE(review): wakeup per iteration (and again below) looks
		 * redundant -- confirm whether once after the loop suffices */
		wakeup((caddr_t) &in_reclaim);
	}

	splx(s);
	in_reclaim = 0;
	wakeup((caddr_t) &in_reclaim);
}
518 
519 
/*
 * swap_pager_copy copies blocks from one pager to another and
 * destroys the source pager
 *
 * The destination's offset "dstoffset" corresponds to the source's
 * offset "offset + srcoffset".  The transfer runs in three passes:
 * 1) free source blocks below the transfer window, 2) move or free
 * blocks within the window, 3) free source blocks past the window.
 */

void
swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset)
	vm_pager_t srcpager;
	vm_offset_t srcoffset;
	vm_pager_t dstpager;
	vm_offset_t dstoffset;
	vm_offset_t offset;
{
	sw_pager_t srcswp, dstswp;
	vm_offset_t i;
	int s;

	srcswp = (sw_pager_t) srcpager->pg_data;
	dstswp = (sw_pager_t) dstpager->pg_data;

/*
 * remove the source pager from the swap_pager internal queue
 */
	s = splbio();
	if (srcswp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list);
		srcswp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list);
	}

	/* wait for any pageouts-in-progress on the source to drain */
	while (srcswp->sw_poip) {
		tsleep((caddr_t)srcswp, PVM, "spgout", 0);
	}
	splx(s);

/*
 * clean all of the pages that are currently active and finished
 */
	(void) swap_pager_clean();

	s = splbio();
/*
 * clear source block before destination object
 * (release allocated space)
 */
	for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(srcswp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			swapsizecheck();
			*addr = SWB_EMPTY;
		}
	}
/*
 * transfer source to destination
 */
	for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) {
		int srcvalid, dstvalid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset,
			&srcvalid);
		int *dstaddrp;
	/*
	 * see if the source has space allocated
	 */
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
		/*
		 * if the source is valid and the dest has no space, then
		 * copy the allocation from the source to the dest.
		 */
			if (srcvalid) {
				dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid);
				/*
				 * if the dest already has a valid block, deallocate the
				 * source block without copying.
				 */
				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
					swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1);
					*dstaddrp = SWB_EMPTY;
				}
				/* move the block: ownership transfers, no I/O */
				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
					*dstaddrp = *srcaddrp;
					*srcaddrp = SWB_EMPTY;
					swap_pager_setvalid(dstswp, i + dstoffset, 1);
					vm_swap_size -= btodb(PAGE_SIZE);
				}
			}
		/*
		 * if the source is not empty at this point, then deallocate the space.
		 */
			if (*srcaddrp != SWB_EMPTY) {
				swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
				if( srcvalid)
					vm_swap_size += btodb(PAGE_SIZE);
				*srcaddrp = SWB_EMPTY;
			}
		}
	}

/*
 * deallocate the rest of the source object
 */
	for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) {
		int valid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid);
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
			swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			*srcaddrp = SWB_EMPTY;
		}
	}

	swapsizecheck();
	splx(s);

	/* destroy the (now empty) source pager */
	free((caddr_t)srcswp->sw_blocks, M_VMPGDATA);
	srcswp->sw_blocks = 0;
	free((caddr_t)srcswp, M_VMPGDATA);
	srcpager->pg_data = 0;
	free((caddr_t)srcpager, M_VMPAGER);

	return;
}
647 
648 
/*
 * swap_pager_dealloc: tear down a swap pager.  Unlinks it from the
 * pager queues, waits for in-flight pageouts, returns all of its
 * swap blocks to the free list, and frees the management structures.
 */
void
swap_pager_dealloc(pager)
	vm_pager_t pager;
{
	register int i,j;
	register sw_blk_t bp;
	register sw_pager_t swp;
	int s;

	/*
	 * Remove from list right away so lookups will fail if we
	 * block for pageout completion.
	 */
	s = splbio();
	swp = (sw_pager_t) pager->pg_data;
	if (swp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
		swp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list);
	}
	/*
	 * Wait for all pageouts to finish and remove
	 * all entries from cleaning list.
	 */

	while (swp->sw_poip) {
		tsleep((caddr_t)swp, PVM, "swpout", 0);
	}
	splx(s);


	(void) swap_pager_clean();

	/*
	 * Free left over swap blocks
	 */
	s = splbio();
	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) {
		for (j = 0; j < SWB_NPAGES; j++)
		if (bp->swb_block[j] != SWB_EMPTY) {
			swap_pager_freeswapspace((unsigned)bp->swb_block[j],
				(unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1);
			/* valid (written) blocks also credit free swap */
			if( bp->swb_valid & (1<<j))
				vm_swap_size += btodb(PAGE_SIZE);
			bp->swb_block[j] = SWB_EMPTY;
		}
	}
	splx(s);
	swapsizecheck();

	/*
	 * Free swap management resources
	 */
	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
	swp->sw_blocks = 0;
	free((caddr_t)swp, M_VMPGDATA);
	pager->pg_data = 0;
	free((caddr_t)pager, M_VMPAGER);
}
709 
710 /*
711  * swap_pager_getmulti can get multiple pages.
712  */
713 int
714 swap_pager_getmulti(pager, m, count, reqpage, sync)
715 	vm_pager_t pager;
716 	vm_page_t *m;
717 	int count;
718 	int reqpage;
719 	boolean_t sync;
720 {
721 	if( reqpage >= count)
722 		panic("swap_pager_getmulti: reqpage >= count\n");
723 	return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage);
724 }
725 
726 /*
727  * swap_pager_getpage gets individual pages
728  */
729 int
730 swap_pager_getpage(pager, m, sync)
731 	vm_pager_t pager;
732 	vm_page_t m;
733 	boolean_t sync;
734 {
735 	vm_page_t marray[1];
736 
737 	marray[0] = m;
738 	return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0);
739 }
740 
741 int
742 swap_pager_putmulti(pager, m, c, sync, rtvals)
743 	vm_pager_t pager;
744 	vm_page_t *m;
745 	int c;
746 	boolean_t sync;
747 	int *rtvals;
748 {
749 	int flags;
750 
751 	if (pager == NULL) {
752 		(void) swap_pager_clean();
753 		return VM_PAGER_OK;
754 	}
755 
756 	flags = B_WRITE;
757 	if (!sync)
758 		flags |= B_ASYNC;
759 
760 	return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals);
761 }
762 
763 /*
764  * swap_pager_putpage writes individual pages
765  */
766 int
767 swap_pager_putpage(pager, m, sync)
768 	vm_pager_t pager;
769 	vm_page_t m;
770 	boolean_t sync;
771 {
772 	int flags;
773 	vm_page_t marray[1];
774 	int rtvals[1];
775 
776 
777 	if (pager == NULL) {
778 		(void) swap_pager_clean();
779 		return VM_PAGER_OK;
780 	}
781 
782 	marray[0] = m;
783 	flags = B_WRITE;
784 	if (!sync)
785 		flags |= B_ASYNC;
786 
787 	swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals);
788 
789 	return rtvals[0];
790 }
791 
792 static inline int
793 const swap_pager_block_index(swp, offset)
794 	sw_pager_t swp;
795 	vm_offset_t offset;
796 {
797 	return (offset / (SWB_NPAGES*PAGE_SIZE));
798 }
799 
800 static inline int
801 const swap_pager_block_offset(swp, offset)
802 	sw_pager_t swp;
803 	vm_offset_t offset;
804 {
805 	return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE);
806 }
807 
808 /*
809  * _swap_pager_haspage returns TRUE if the pager has data that has
810  * been written out.
811  */
812 static boolean_t
813 _swap_pager_haspage(swp, offset)
814 	sw_pager_t swp;
815 	vm_offset_t offset;
816 {
817 	register sw_blk_t swb;
818 	int ix;
819 
820 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
821 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
822 		return(FALSE);
823 	}
824 	swb = &swp->sw_blocks[ix];
825 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
826 	if (swb->swb_block[ix] != SWB_EMPTY) {
827 		if (swb->swb_valid & (1 << ix))
828 			return TRUE;
829 	}
830 
831 	return(FALSE);
832 }
833 
834 /*
835  * swap_pager_haspage is the externally accessible version of
836  * _swap_pager_haspage above.  this routine takes a vm_pager_t
837  * for an argument instead of sw_pager_t.
838  */
839 boolean_t
840 swap_pager_haspage(pager, offset)
841 	vm_pager_t pager;
842 	vm_offset_t offset;
843 {
844 	return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset);
845 }
846 
847 /*
848  * swap_pager_freepage is a convienience routine that clears the busy
849  * bit and deallocates a page.
850  */
851 static void
852 swap_pager_freepage(m)
853 	vm_page_t m;
854 {
855 	PAGE_WAKEUP(m);
856 	vm_page_free(m);
857 }
858 
859 /*
860  * swap_pager_ridpages is a convienience routine that deallocates all
861  * but the required page.  this is usually used in error returns that
862  * need to invalidate the "extra" readahead pages.
863  */
864 static void
865 swap_pager_ridpages(m, count, reqpage)
866 	vm_page_t *m;
867 	int count;
868 	int reqpage;
869 {
870 	int i;
871 	for (i = 0; i < count; i++)
872 		if (i != reqpage)
873 			swap_pager_freepage(m[i]);
874 }
875 
876 int swapwritecount=0;
877 
878 /*
879  * swap_pager_iodone1 is the completion routine for both reads and async writes
880  */
881 void
882 swap_pager_iodone1(bp)
883 	struct buf *bp;
884 {
885 	bp->b_flags |= B_DONE;
886 	bp->b_flags &= ~B_ASYNC;
887 	wakeup((caddr_t)bp);
888 /*
889 	if ((bp->b_flags & B_READ) == 0)
890 		vwakeup(bp);
891 */
892 }
893 
894 
/*
 * swap_pager_input: read "count" pages (m[]) from swap into memory,
 * where m[reqpage] is the page the caller faulted on.  Trims the
 * request down to the run of pages around reqpage that are backed by
 * contiguous, valid swap blocks in the same dmmax region, maps them
 * into kernel va, and performs one synchronous read.  Returns
 * VM_PAGER_OK or VM_PAGER_FAIL.
 */
int
swap_pager_input(swp, m, count, reqpage)
	register sw_pager_t swp;
	vm_page_t *m;
	int count, reqpage;
{
	register struct buf *bp;
	sw_blk_t swb[count];		/* block-group entry per page */
	register int s;
	int i;
	boolean_t rv;
	vm_offset_t kva, off[count];	/* per-page index within its group */
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];		/* per-page swap block address */

	int first, last;
	int failed;
	int reqdskregion;

	object = m[reqpage]->object;
	paging_offset = object->paging_offset;
	/*
	 * First determine if the page exists in the pager if this is
	 * a sync read.  This quickly handles cases where we are
	 * following shadow chains looking for the top level object
	 * with the page.
	 */
	if (swp->sw_blocks == NULL) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}

	/* resolve each page to its block-group entry and swap address */
	for(i = 0; i < count; i++) {
		vm_offset_t foff = m[i]->offset + paging_offset;
		int ix = swap_pager_block_index(swp, foff);
		if (ix >= swp->sw_nblocks) {
			int j;
			/* out of range at or before reqpage: hard fail */
			if( i <= reqpage) {
				swap_pager_ridpages(m, count, reqpage);
				return(VM_PAGER_FAIL);
			}
			/* past reqpage: just truncate the readahead */
			for(j = i; j < count; j++) {
				swap_pager_freepage(m[j]);
			}
			count = i;
			break;
		}

		swb[i] = &swp->sw_blocks[ix];
		off[i] = swap_pager_block_offset(swp, foff);
		reqaddr[i] = swb[i]->swb_block[off[i]];
	}

	/* make sure that our required input request is existant */

	if (reqaddr[reqpage] == SWB_EMPTY ||
		(swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}


	reqdskregion = reqaddr[reqpage] / dmmax;

	/*
	 * search backwards for the first contiguous page to transfer
	 */
	failed = 0;
	first = 0;
	for (i = reqpage - 1; i >= 0; --i) {
		/* drop pages that are empty, unwritten, non-contiguous on
		 * disk, or in a different dmmax region than reqpage */
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (first == 0)
					first = i + 1;
		}
	}
	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	last = count;
	for (i = reqpage + 1; i < count; i++) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (last == count)
					last = i;
		}
	}

	/* compact the surviving run down to m[0..count-1] */
	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i-first] = m[i];
			reqaddr[i-first] = reqaddr[i];
			off[i-first] = off[i];
		}
		count -= first;
		reqpage -= first;
	}

	/* hold the block group against reclaim while I/O is pending */
	++swb[reqpage]->swb_locked;

	/*
	 * at this point:
	 * "m" is a pointer to the array of vm_page_t for paging I/O
	 * "count" is the number of vm_page_t entries represented by "m"
	 * "object" is the vm_object_t for I/O
	 * "reqpage" is the index into "m" for the page actually faulted
	 */

	spc = NULL;	/* we might not use an spc data structure */

	if (count == 1) {
		/*
		 * if a kva has not been allocated, we can only do a one page transfer,
		 * so we free the other pages that might have been allocated by
		 * vm_fault.
		 */
		swap_pager_ridpages(m, count, reqpage);
		m[0] = m[reqpage];
		reqaddr[0] = reqaddr[reqpage];

		count = 1;
		reqpage = 0;
	/*
	 * get a swap pager clean data structure, block until we get it
	 */
		if (swap_pager_free.tqh_first == NULL) {
			s = splbio();
			/* the pageout daemon must clean for itself; others
			 * wake it and sleep until an spc is freed */
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
			while (swap_pager_free.tqh_first == NULL) {
				swap_pager_needflags |= SWAP_FREE_NEEDED;
				tsleep((caddr_t)&swap_pager_free,
					PVM, "swpfre", 0);
				if( curproc == pageproc)
					(void) swap_pager_clean();
				else
					wakeup((caddr_t) &vm_pages_needed);
			}
			splx(s);
		}
		spc = swap_pager_free.tqh_first;
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
		kva = spc->spc_kva;
		bp = spc->spc_bp;
		bzero(bp, sizeof *bp);
		bp->b_spc = spc;
	} else {
	/*
	 * Get a swap buffer header to perform the IO
	 */
		bp = getpbuf();
		kva = (vm_offset_t) bp->b_data;
	}

	/*
	 * map our page(s) into kva for input
	 */
	pmap_qenter( kva, m, count);

	s = splbio();
	/* set up the buffer for a synchronous read from swapdev */
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = swap_pager_iodone1;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;

	bgetvp( swapdev_vp, bp);

	/* account for a pagein-in-progress on this pager */
	swp->sw_piip++;

	cnt.v_swapin++;
	cnt.v_swappgsin += count;
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swread", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_piip;
	if (swp->sw_piip == 0)
		wakeup((caddr_t) swp);

	/*
	 * relpbuf does this, but we maintain our own buffer
	 * list also...
	 */
	if (bp->b_vp)
		brelvp(bp);

	splx(s);
	/* release the reclaim lock taken above */
	--swb[reqpage]->swb_locked;

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	if (spc) {
		/*
		 * if we have used an spc, we need to free it.
		 */
		if( bp->b_rcred != NOCRED)
			crfree(bp->b_rcred);
		if( bp->b_wcred != NOCRED)
			crfree(bp->b_wcred);
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
	} else {
		/*
		 * release the physical I/O buffer
		 */
		relpbuf(bp);
		/*
		 * finish up input if everything is ok
		 */
		if( rv == VM_PAGER_OK) {
			for (i = 0; i < count; i++) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				if (i != reqpage) {
					/*
					 * whether or not to leave the page activated
					 * is up in the air, but we should put the page
					 * on a page queue somewhere. (it already is in
					 * the object).
					 * After some emperical results, it is best
					 * to deactivate the readahead pages.
					 */
					vm_page_deactivate(m[i]);

					/*
					 * just in case someone was asking for this
					 * page we now tell them that it is ok to use
					 */
					m[i]->flags &= ~PG_FAKE;
					PAGE_WAKEUP(m[i]);
				}
			}
			/*
			 * If we're out of swap space, then attempt to free
			 * some whenever pages are brought in. We must clear
			 * the clean flag so that the page contents will be
			 * preserved.
			 */
			if (swap_pager_full) {
				for (i = 0; i < count; i++) {
					m[i]->flags &= ~PG_CLEAN;
				}
				_swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE);
			}
		} else {
			swap_pager_ridpages(m, count, reqpage);
		}
	}
	return(rv);
}
1182 
/*
 * swap_pager_output:
 *	Write the given (busy) pages out to their backing swap area.
 *
 *	swp    - per-object swap pager metadata (block map, I/O counters)
 *	m      - array of pages to page out (all from the same object)
 *	count  - number of pages in m
 *	flags  - B_ASYNC requests an asynchronous write; otherwise the
 *		 routine sleeps until the I/O completes
 *	rtvals - per-page result codes (VM_PAGER_OK/FAIL/AGAIN/PEND)
 *
 *	Returns a single VM_PAGER_* code summarizing the operation.
 *	Async writes that are successfully queued return VM_PAGER_PEND;
 *	their real completion is handled by swap_pager_iodone() and
 *	later reaped by swap_pager_clean().
 *
 *	Only the leading run of pages whose swap blocks are physically
 *	contiguous is actually written; trailing pages get
 *	VM_PAGER_AGAIN so the caller retries them.
 */
int
swap_pager_output(swp, m, count, flags, rtvals)
	register sw_pager_t swp;
	vm_page_t *m;
	int count;
	int flags;
	int *rtvals;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i, j, ix;
	boolean_t rv;	/* NOTE(review): carries VM_PAGER_* codes, not a
			 * boolean; declared boolean_t (an int typedef in
			 * this VM code) -- confirm before changing. */
	vm_offset_t kva, off, foff;
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];
	int failed;

/*
	if( count > 1)
		printf("off: 0x%x, count: %d\n", m[0]->offset, count);
*/
	spc = NULL;

	object = m[0]->object;
	paging_offset = object->paging_offset;

	/*
	 * Pass 1: for each page, look up (and if necessary allocate)
	 * its swap block.  Once any page fails, every later page is
	 * failed too so that only a leading prefix can proceed.
	 */
	failed = 0;
	for(j=0;j<count;j++) {
		foff = m[j]->offset + paging_offset;
		ix = swap_pager_block_index(swp, foff);
		swb[j] = 0;
		if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
			/* offset is outside this pager's block map */
			rtvals[j] = VM_PAGER_FAIL;
			failed = 1;
			continue;
		} else {
			rtvals[j] = VM_PAGER_OK;
		}
		/* lock the block against reclaim while we work on it */
		swb[j] = &swp->sw_blocks[ix];
		++swb[j]->swb_locked;
		if( failed) {
			rtvals[j] = VM_PAGER_FAIL;
			continue;
		}
		off = swap_pager_block_offset(swp, foff);
		reqaddr[j] = swb[j]->swb_block[off];
		if( reqaddr[j] == SWB_EMPTY) {
			/* no swap space assigned yet -- allocate some */
			int blk;
			int tries;
			int ntoget;
			tries = 0;
			s = splbio();

			/*
			 * if any other pages have been allocated in this block, we
			 * only try to get one page.
			 */
			for (i = 0; i < SWB_NPAGES; i++) {
				if (swb[j]->swb_block[i] != SWB_EMPTY)
					break;
			}


			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
			/*
			 * this code is alittle conservative, but works
			 * (the intent of this code is to allocate small chunks
			 *  for small objects)
			 */
			if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) {
				ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE;
			}

retrygetspace:
			if (!swap_pager_full && ntoget > 1 &&
				swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) {

				/* got a contiguous multi-page chunk; carve it up */
				for (i = 0; i < ntoget; i++) {
					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
					swb[j]->swb_valid = 0;
				}

				reqaddr[j] = swb[j]->swb_block[off];
			} else if (!swap_pager_getswapspace(btodb(PAGE_SIZE),
				&swb[j]->swb_block[off])) {
				/*
				 * if the allocation has failed, we try to reclaim space and
				 * retry.
				 */
				if (++tries == 1) {
					swap_pager_reclaim();
					goto retrygetspace;
				}
				rtvals[j] = VM_PAGER_AGAIN;
				failed = 1;
			} else {
				/* single page allocated; its old contents are stale */
				reqaddr[j] = swb[j]->swb_block[off];
				swb[j]->swb_valid &= ~(1<<off);
			}
			splx(s);
		}
	}

	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	for (i = 0; i < count; i++) {
		/*
		 * break the run at the first page whose swap address is not
		 * sequential with page 0, crosses a dmmax interleave
		 * boundary, or already failed above.
		 */
		if( failed || (reqaddr[i] != reqaddr[0] + i*btodb(PAGE_SIZE)) ||
			(reqaddr[i] / dmmax) != (reqaddr[0] / dmmax) ||
			(rtvals[i] != VM_PAGER_OK)) {
			failed = 1;
			if( rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
	}

	/* drop the block locks taken above for pages we will not write */
	for(i = 0; i < count; i++) {
		if( rtvals[i] != VM_PAGER_OK) {
			if( swb[i])
				--swb[i]->swb_locked;
		}
	}

	/* find the length of the leading VM_PAGER_OK prefix */
	for(i = 0; i < count; i++)
		if( rtvals[i] != VM_PAGER_OK)
			break;

	if( i == 0) {
		/* nothing at all is writable right now */
		return VM_PAGER_AGAIN;
	}

	count = i;
	/* sanity check: every page in the run must have swap assigned */
	for(i=0;i<count;i++) {
		if( reqaddr[i] == SWB_EMPTY)
			printf("I/O to empty block????\n");
	}

	/*
	 * "count" now covers only the leading run of contiguous,
	 * successfully allocated pages computed above.
	 */

	/*
	 * For synchronous writes, we clean up
	 * all completed async pageouts.
	 */
	if ((flags & B_ASYNC) == 0) {
		swap_pager_clean();
	}

	kva = 0;

	/*
	 * we allocate a new kva for transfers > 1 page
	 * but for transfers == 1 page, the swap_pager_free list contains
	 * entries that have pre-allocated kva's (for efficiency).
	 * NOTE -- we do not use the physical buffer pool or the
	 * preallocated associated kva's because of the potential for
	 * deadlock.  This is very subtile -- but deadlocks or resource
	 * contention must be avoided on pageouts -- or your system will
	 * sleep (forever) !!!
	 */
/*
	if ( count > 1) {
		kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE);
		if( !kva) {
			for (i = 0; i < count; i++) {
				if( swb[i])
					--swb[i]->swb_locked;
				rtvals[i] = VM_PAGER_AGAIN;
			}
			return VM_PAGER_AGAIN;
		}
	}
*/

	/*
	 * get a swap pager clean data structure, block until we get it
	 */
	if (swap_pager_free.tqh_first == NULL) {
		s = splbio();
		/*
		 * the pageout daemon reaps completed requests itself;
		 * anyone else pokes the daemon and waits.
		 */
		if( curproc == pageproc)
			(void) swap_pager_clean();
		else
			wakeup((caddr_t) &vm_pages_needed);
		while (swap_pager_free.tqh_first == NULL) {
			swap_pager_needflags |= SWAP_FREE_NEEDED;
			tsleep((caddr_t)&swap_pager_free,
				PVM, "swpfre", 0);
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
		}
		splx(s);
	}

	spc = swap_pager_free.tqh_first;
	TAILQ_REMOVE(&swap_pager_free, spc, spc_list);

	kva = spc->spc_kva;

	/*
	 * map our page(s) into kva for I/O
	 */
	pmap_qenter(kva, m, count);

	/*
	 * get the base I/O offset into the swap file
	 */
	for(i=0;i<count;i++) {
		foff = m[i]->offset + paging_offset;
		off = swap_pager_block_offset(swp, foff);
		/*
		 * if we are setting the valid bit anew,
		 * then diminish the swap free space
		 */
		if( (swb[i]->swb_valid & (1 << off)) == 0)
			vm_swap_size -= btodb(PAGE_SIZE);

		/*
		 * set the valid bit
		 */
		swb[i]->swb_valid |= (1 << off);
		/*
		 * and unlock the data structure
		 */
		--swb[i]->swb_locked;
	}

	s = splbio();
	/*
	 * Get a swap buffer header and perform the IO
	 */
	bp = spc->spc_bp;
	bzero(bp, sizeof *bp);
	bp->b_spc = spc;

	bp->b_flags = B_BUSY;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if( bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_data = (caddr_t) kva;
	/* all pages in the run are contiguous, so one blkno suffices */
	bp->b_blkno = reqaddr[0];
	bgetvp( swapdev_vp, bp);

	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;
	swapdev_vp->v_numoutput++;

	/*
	 * If this is an async write we set up additional buffer fields
	 * and place a "cleaning" entry on the inuse queue.
	 */
	if ( flags & B_ASYNC ) {
		spc->spc_flags = 0;
		spc->spc_swp = swp;
		for(i=0;i<count;i++)
			spc->spc_m[i] = m[i];
		spc->spc_count = count;
		/*
		 * the completion routine for async writes
		 */
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;
		swp->sw_poip++;
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
	} else {
		/* sync write: iodone1 just marks the buffer done */
		swp->sw_poip++;
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone1;
	}

	cnt.v_swapout++;
	cnt.v_swappgsout += count;
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);
	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) {
		/* async: if it already finished, reap it now */
		if ((bp->b_flags & B_DONE) == B_DONE) {
			swap_pager_clean();
		}
		splx(s);
		for(i=0;i<count;i++) {
			rtvals[i] = VM_PAGER_PEND;
		}
		return VM_PAGER_PEND;
	}

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swwrt", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	/* last pageout-in-progress gone: wake anyone waiting on swp */
	--swp->sw_poip;
	if (swp->sw_poip == 0)
		wakeup((caddr_t) swp);

	if (bp->b_vp)
		brelvp(bp);

	splx(s);

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	/*
	 * if we have written the page, then indicate that the page
	 * is clean.
	 */
	if (rv == VM_PAGER_OK) {
		for(i=0;i<count;i++) {
			if( rtvals[i] == VM_PAGER_OK) {
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				/*
				 * optimization, if a page has been read during the
				 * pageout process, we activate it.
				 */
				if ( (m[i]->flags & PG_ACTIVE) == 0 &&
					pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))
					vm_page_activate(m[i]);
			}
		}
	} else {
		/* write failed: leave pages dirty so they are retried */
		for(i=0;i<count;i++) {
			rtvals[i] = rv;
			m[i]->flags |= PG_LAUNDRY;
		}
	}

	/* release credentials and return the spc to the free list */
	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
	if (swap_pager_needflags & SWAP_FREE_NEEDED) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
	}

	return(rv);
}
1541 
/*
 * swap_pager_clean:
 *	Reap completed asynchronous pageouts.  For every request on the
 *	"done" queue: unmap its kernel virtual mapping, finish up the
 *	pages (swap_pager_finish), and move the cleaning structure back
 *	to the free list, waking anyone waiting for a free entry.
 *
 *	NOTE(review): tspc is initialized to NULL and never assigned a
 *	non-NULL value in this function, so the final return is always
 *	FALSE; the tspc machinery appears vestigial -- confirm against
 *	callers before relying on the return value.
 */
boolean_t
swap_pager_clean()
{
	register swp_clean_t spc, tspc;
	register int s;

	tspc = NULL;
	/* cheap unlocked check: nothing completed, nothing to do */
	if (swap_pager_done.tqh_first == NULL)
		return FALSE;
	for (;;) {
		s = splbio();
		/*
		 * Look up and removal from done list must be done
		 * at splbio() to avoid conflicts with swap_pager_iodone.
		 */
		while ((spc = swap_pager_done.tqh_first) != 0) {
			pmap_qremove( spc->spc_kva, spc->spc_count);
			swap_pager_finish(spc);
			TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
			/* process this entry below, then loop for more */
			goto doclean;
		}

		/*
		 * No operations done, thats all we can do for now.
		 */

		splx(s);
		break;

		/*
		 * The desired page was found to be busy earlier in
		 * the scan but has since completed.
		 */
doclean:
		if (tspc && tspc == spc) {
			tspc = NULL;
		}
		/* recycle the cleaning structure onto the free list */
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
		++cleandone;
		splx(s);
	}

	return(tspc ? TRUE : FALSE);
}
1591 
1592 void
1593 swap_pager_finish(spc)
1594 	register swp_clean_t spc;
1595 {
1596 	vm_object_t object = spc->spc_m[0]->object;
1597 	int i;
1598 
1599 	if ((object->paging_in_progress -= spc->spc_count) == 0)
1600 		thread_wakeup((int) object);
1601 
1602 	/*
1603 	 * If no error mark as clean and inform the pmap system.
1604 	 * If error, mark as dirty so we will try again.
1605 	 * (XXX could get stuck doing this, should give up after awhile)
1606 	 */
1607 	if (spc->spc_flags & SPC_ERROR) {
1608 		for(i=0;i<spc->spc_count;i++) {
1609 			printf("swap_pager_finish: clean of page %lx failed\n",
1610 			       (u_long)VM_PAGE_TO_PHYS(spc->spc_m[i]));
1611 			spc->spc_m[i]->flags |= PG_LAUNDRY;
1612 		}
1613 	} else {
1614 		for(i=0;i<spc->spc_count;i++) {
1615 			pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i]));
1616 			spc->spc_m[i]->flags |= PG_CLEAN;
1617 		}
1618 	}
1619 
1620 
1621 	for(i=0;i<spc->spc_count;i++) {
1622 		/*
1623 		 * we wakeup any processes that are waiting on
1624 		 * these pages.
1625 		 */
1626 		PAGE_WAKEUP(spc->spc_m[i]);
1627 	}
1628 	nswiodone -= spc->spc_count;
1629 
1630 	return;
1631 }
1632 
1633 /*
1634  * swap_pager_iodone
1635  */
1636 void
1637 swap_pager_iodone(bp)
1638 	register struct buf *bp;
1639 {
1640 	register swp_clean_t spc;
1641 	int s;
1642 
1643 	s = splbio();
1644 	spc = (swp_clean_t) bp->b_spc;
1645 	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
1646 	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
1647 	if (bp->b_flags & B_ERROR) {
1648 		spc->spc_flags |= SPC_ERROR;
1649 		printf("error %d blkno %lu sz %ld ",
1650 			bp->b_error, (u_long)bp->b_blkno, bp->b_bcount);
1651 	}
1652 
1653 /*
1654 	if ((bp->b_flags & B_READ) == 0)
1655 		vwakeup(bp);
1656 */
1657 
1658 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC);
1659 	if (bp->b_vp) {
1660 		brelvp(bp);
1661 	}
1662 	if( bp->b_rcred != NOCRED)
1663 		crfree(bp->b_rcred);
1664 	if( bp->b_wcred != NOCRED)
1665 		crfree(bp->b_wcred);
1666 
1667 	nswiodone += spc->spc_count;
1668 	if (--spc->spc_swp->sw_poip == 0) {
1669 		wakeup((caddr_t)spc->spc_swp);
1670 	}
1671 
1672 	if ((swap_pager_needflags & SWAP_FREE_NEEDED) ||
1673 	    swap_pager_inuse.tqh_first == 0) {
1674 		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
1675 		wakeup((caddr_t)&swap_pager_free);
1676 		wakeup((caddr_t)&vm_pages_needed);
1677 	}
1678 
1679 	if (vm_pageout_pages_needed) {
1680 		wakeup((caddr_t)&vm_pageout_pages_needed);
1681 	}
1682 
1683 	if ((swap_pager_inuse.tqh_first == NULL) ||
1684 	    (cnt.v_free_count < cnt.v_free_min &&
1685 	    nswiodone + cnt.v_free_count >= cnt.v_free_min) ) {
1686 		wakeup((caddr_t)&vm_pages_needed);
1687 	}
1688 	splx(s);
1689 }
1690 
1691 /*
1692  * return true if any swap control structures can be allocated
1693  */
1694 int
1695 swap_pager_ready() {
1696 	if( swap_pager_free.tqh_first)
1697 		return 1;
1698 	else
1699 		return 0;
1700 }
1701