/*
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
 *
 *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
 * $Id: swap_pager.c,v 1.9 1994/09/25 04:02:10 davidg Exp $
 */

/*
 * Quick hack to page to dedicated partition(s).
 * TODO:
 *	Add multiprocessor locks
 *	Deal with async writes in a better fashion
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>

#include <miscfs/specfs/specdev.h>
#include <sys/rlist.h>

#include <vm/vm.h>
#include <vm/vm_pager.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>

#ifndef NPENDINGIO
#define NPENDINGIO	16
#endif

int	swap_pager_input __P((sw_pager_t, vm_page_t *, int, int));
int	swap_pager_output __P((sw_pager_t, vm_page_t *, int, int, int *));

int nswiodone;
extern int vm_pageout_rate_limit;
static int cleandone;
extern int hz;
int swap_pager_full;
extern vm_map_t pager_map;
extern int vm_swap_size;

#define MAX_PAGEOUT_CLUSTER 8

TAILQ_HEAD(swpclean, swpagerclean);

typedef	struct swpagerclean	*swp_clean_t;

struct swpagerclean {
	TAILQ_ENTRY(swpagerclean)	spc_list;
	int				spc_flags;
	struct buf			*spc_bp;
	sw_pager_t			spc_swp;
	vm_offset_t			spc_kva;
	int				spc_count;
	vm_page_t			spc_m[MAX_PAGEOUT_CLUSTER];
} swcleanlist[NPENDINGIO];
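
/*
 * Each swpagerclean entry cycles through three queues: swap_pager_free
 * (idle; KVA and buf header are preallocated at initialization),
 * swap_pager_inuse (an async pageout is in flight), and swap_pager_done
 * (I/O complete, waiting for swap_pager_clean to recycle the entry).
 * The synchronous paths borrow an entry from the free list and return
 * it directly, without passing through the inuse/done lists.
 */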


extern vm_map_t kernel_map;

/* spc_flags values */
#define SPC_ERROR	0x01

#define SWB_EMPTY (-1)

struct swpclean swap_pager_done;	/* list of completed page cleans */
struct swpclean swap_pager_inuse;	/* list of pending page cleans */
struct swpclean swap_pager_free;	/* list of free pager clean structs */
struct pagerlst swap_pager_list;	/* list of "named" anon regions */
struct pagerlst swap_pager_un_list;	/* list of "unnamed" anon pagers */

#define	SWAP_FREE_NEEDED	0x1	/* need a swap block */
int swap_pager_needflags;
struct rlist *swapfrag;

struct pagerlst *swp_qs[]={
	&swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0
};
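
/*
 * swp_qs is a NULL-terminated array of the pager queues; consumers
 * such as swap_pager_reclaim walk it to visit both the named and the
 * unnamed anonymous pagers in one pass.
 */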

int swap_pager_putmulti();

struct pagerops swappagerops = {
	swap_pager_init,
	swap_pager_alloc,
	swap_pager_dealloc,
	swap_pager_getpage,
	swap_pager_getmulti,
	swap_pager_putpage,
	swap_pager_putmulti,
	swap_pager_haspage
};

int npendingio = NPENDINGIO;
int pendingiowait;
int require_swap_init;
void swap_pager_finish();
int dmmin, dmmax;
extern int vm_page_count;

struct buf *getpbuf();
void relpbuf(struct buf *bp);

static inline void swapsizecheck() {
	if( vm_swap_size < 128*btodb(PAGE_SIZE)) {
		if( swap_pager_full)
			printf("swap_pager: out of space\n");
		swap_pager_full = 1;
	} else if( vm_swap_size > 192*btodb(PAGE_SIZE))
		swap_pager_full = 0;
}
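
/*
 * The two thresholds above give swap_pager_full some hysteresis.  As a
 * worked example (illustrative, assuming 4K pages and 512-byte disk
 * blocks, so btodb(PAGE_SIZE) == 8): the pager is marked full when
 * fewer than 128 pages (512K) of swap remain, and is not marked
 * un-full again until more than 192 pages (768K) are free, so the flag
 * does not flap while vm_swap_size hovers near a single threshold.
 */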

void
swap_pager_init()
{
	dfltpagerops = &swappagerops;

	TAILQ_INIT(&swap_pager_list);
	TAILQ_INIT(&swap_pager_un_list);

	/*
	 * Initialize clean lists
	 */
	TAILQ_INIT(&swap_pager_inuse);
	TAILQ_INIT(&swap_pager_done);
	TAILQ_INIT(&swap_pager_free);

	require_swap_init = 1;

	/*
	 * Calculate the swap allocation constants.
	 */

	dmmin = CLBYTES/DEV_BSIZE;
	dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2;

}
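
/*
 * Worked example for the constants above (illustrative only, assuming
 * CLBYTES == PAGE_SIZE == 4096, DEV_BSIZE == 512 and SWB_NPAGES == 8):
 * dmmin = 4096/512 = 8 disk blocks (one page), and
 * dmmax = btodb(8*4096)*2 = 64*2 = 128 disk blocks, i.e. a swap
 * interleave region twice the size of one sw_blk_t cluster.
 */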

/*
 * Allocate a pager structure and associated resources.
 * Note that if we are called from the pageout daemon (handle == NULL)
 * we should not wait for memory as it could result in deadlock.
 */
vm_pager_t
swap_pager_alloc(handle, size, prot, offset)
	caddr_t handle;
	register vm_size_t size;
	vm_prot_t prot;
	vm_offset_t offset;
{
	register vm_pager_t pager;
	register sw_pager_t swp;
	int waitok;
	int i,j;

	if (require_swap_init) {
		swp_clean_t spc;
		struct buf *bp;
		/*
		 * kva's are allocated here so that we don't need to keep
		 * doing kmem_alloc_pageable at runtime
		 */
		for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) {
			spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE*MAX_PAGEOUT_CLUSTER);
			if (!spc->spc_kva) {
				break;
			}
			spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT);
			if (!spc->spc_bp) {
				kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE*MAX_PAGEOUT_CLUSTER);
				break;
			}
			spc->spc_flags = 0;
			TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		}
		require_swap_init = 0;
		if( size == 0)
			return(NULL);
	}

	/*
	 * If this is a "named" anonymous region, look it up and
	 * return the appropriate pager if it exists.
	 */
	if (handle) {
		pager = vm_pager_lookup(&swap_pager_list, handle);
		if (pager != NULL) {
			/*
			 * Use vm_object_lookup to gain a reference
			 * to the object and also to remove from the
			 * object cache.
			 */
			if (vm_object_lookup(pager) == NULL)
				panic("swap_pager_alloc: bad object");
			return(pager);
		}
	}

	if (swap_pager_full) {
		return(NULL);
	}

	/*
	 * Pager doesn't exist, allocate swap management resources
	 * and initialize.
	 */
	waitok = handle ? M_WAITOK : M_NOWAIT;
	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
	if (pager == NULL)
		return(NULL);
	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
	if (swp == NULL) {
		free((caddr_t)pager, M_VMPAGER);
		return(NULL);
	}
	size = round_page(size);
	swp->sw_osize = size;
	swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE);
	swp->sw_blocks = (sw_blk_t)
		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
		       M_VMPGDATA, waitok);
	if (swp->sw_blocks == NULL) {
		free((caddr_t)swp, M_VMPGDATA);
		free((caddr_t)pager, M_VMPAGER);
		return(NULL);
	}

	for (i = 0; i < swp->sw_nblocks; i++) {
		swp->sw_blocks[i].swb_valid = 0;
		swp->sw_blocks[i].swb_locked = 0;
		for (j = 0; j < SWB_NPAGES; j++)
			swp->sw_blocks[i].swb_block[j] = SWB_EMPTY;
	}

	swp->sw_poip = 0;
	if (handle) {
		vm_object_t object;

		swp->sw_flags = SW_NAMED;
		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
		/*
		 * Consistent with other pagers: return with object
		 * referenced.  Can't do this with handle == NULL
		 * since it might be the pageout daemon calling.
		 */
		object = vm_object_allocate(size);
		vm_object_enter(object, pager);
		vm_object_setpager(object, pager, 0, FALSE);
	} else {
		swp->sw_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list);
	}
	pager->pg_handle = handle;
	pager->pg_ops = &swappagerops;
	pager->pg_type = PG_SWAP;
	pager->pg_data = (caddr_t)swp;

	return(pager);
}

/*
 * returns the disk block associated with a pager and offset;
 * additionally, as a side effect, returns a flag indicating
 * whether the block has been written
 */

static int *
swap_pager_diskaddr(swp, offset, valid)
	sw_pager_t swp;
	vm_offset_t offset;
	int *valid;
{
	register sw_blk_t swb;
	int ix;

	if (valid)
		*valid = 0;
	ix = offset / (SWB_NPAGES*PAGE_SIZE);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
		return(FALSE);
	}
	swb = &swp->sw_blocks[ix];
	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
	if (valid)
		*valid = swb->swb_valid & (1<<ix);
	return &swb->swb_block[ix];
}
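
/*
 * Worked example of the lookup above (illustrative, assuming
 * SWB_NPAGES == 8 and PAGE_SIZE == 4096): for offset 0x15000 the
 * cluster index is 0x15000 / 0x8000 == 2 and the page slot within the
 * cluster is (0x15000 % 0x8000) / 0x1000 == 5, so the disk address
 * lives in swp->sw_blocks[2].swb_block[5] and its written bit is
 * swb_valid & (1 << 5).
 */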

/*
 * Utility routine to set the valid (written) bit for
 * a block associated with a pager and offset
 */
static void
swap_pager_setvalid(swp, offset, valid)
	sw_pager_t swp;
	vm_offset_t offset;
	int valid;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / (SWB_NPAGES*PAGE_SIZE);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks)
		return;

	swb = &swp->sw_blocks[ix];
	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
	if (valid)
		swb->swb_valid |= (1 << ix);
	else
		swb->swb_valid &= ~(1 << ix);
	return;
}

/*
 * this routine allocates swap space with a fragmentation
 * minimization policy.
 */
int
swap_pager_getswapspace( unsigned amount, unsigned *rtval) {
#ifdef EXP
	unsigned tmpalloc;
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	if( amount < nblocksfrag) {
		if( rlist_alloc(&swapfrag, amount, rtval))
			return 1;
		if( !rlist_alloc(&swapmap, nblocksfrag, &tmpalloc))
			return 0;
		rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1);
		*rtval = tmpalloc;
		return 1;
	}
#endif
	if( !rlist_alloc(&swapmap, amount, rtval))
		return 0;
	else
		return 1;
}
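
/*
 * A minimal usage sketch for the allocator above (not compiled; the
 * local "blk" is hypothetical).  swap_pager_getswapspace returns
 * nonzero on success and hands back the first disk block of the run
 * through its second argument; swap_pager_freeswapspace takes an
 * inclusive block range, as the callers below do for one page:
 */
#if 0
	unsigned blk;

	if (swap_pager_getswapspace(btodb(PAGE_SIZE), &blk)) {
		/* ... use disk blocks [blk, blk + btodb(PAGE_SIZE) - 1] ... */
		swap_pager_freeswapspace(blk, blk + btodb(PAGE_SIZE) - 1);
	}
#endif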

/*
 * this routine frees swap space with a fragmentation
 * minimization policy.
 */
void
swap_pager_freeswapspace( unsigned from, unsigned to) {
#ifdef EXP
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	unsigned tmpalloc;
	if( ((to + 1) - from) >= nblocksfrag) {
#endif
		rlist_free(&swapmap, from, to);
#ifdef EXP
		return;
	}
	rlist_free(&swapfrag, from, to);
	while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) {
		rlist_free(&swapmap, tmpalloc, tmpalloc + nblocksfrag-1);
	}
#endif
}
/*
 * this routine frees swap blocks from a specified pager
 */
void
_swap_pager_freespace(swp, start, size)
	sw_pager_t swp;
	vm_offset_t start;
	vm_offset_t size;
{
	vm_offset_t i;
	int s;

	s = splbio();
	for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(swp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid) {
				vm_swap_size += btodb(PAGE_SIZE);
				swap_pager_setvalid(swp, i, 0);
			}
			*addr = SWB_EMPTY;
		}
	}
	swapsizecheck();
	splx(s);
}

void
swap_pager_freespace(pager, start, size)
	vm_pager_t pager;
	vm_offset_t start;
	vm_offset_t size;
{
	_swap_pager_freespace((sw_pager_t) pager->pg_data, start, size);
}

/*
 * swap_pager_reclaim frees up over-allocated space from all pagers
 * this eliminates internal fragmentation due to allocation of space
 * for segments that are never swapped to. It has been written so that
 * it does not block until the rlist_free operation occurs; it keeps
 * the queues consistent.
 */

/*
 * Maximum number of blocks (pages) to reclaim per pass
 */
#define MAXRECLAIM 256

void
swap_pager_reclaim()
{
	vm_pager_t p;
	sw_pager_t swp;
	int i, j, k;
	int s;
	int reclaimcount;
	static int reclaims[MAXRECLAIM];
	static int in_reclaim;

/*
 * allow only one process to be in the swap_pager_reclaim subroutine
 */
	s = splbio();
	if (in_reclaim) {
		tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0);
		splx(s);
		return;
	}
	in_reclaim = 1;
	reclaimcount = 0;

	/* for each pager queue */
	for (k = 0; swp_qs[k]; k++) {

		p = swp_qs[k]->tqh_first;
		while (p && (reclaimcount < MAXRECLAIM)) {

			/*
			 * see if any blocks associated with a pager have been
			 * allocated but not used (written)
			 */
			swp = (sw_pager_t) p->pg_data;
			for (i = 0; i < swp->sw_nblocks; i++) {
				sw_blk_t swb = &swp->sw_blocks[i];
				if( swb->swb_locked)
					continue;
				for (j = 0; j < SWB_NPAGES; j++) {
					if (swb->swb_block[j] != SWB_EMPTY &&
						(swb->swb_valid & (1 << j)) == 0) {
						reclaims[reclaimcount++] = swb->swb_block[j];
						swb->swb_block[j] = SWB_EMPTY;
						if (reclaimcount >= MAXRECLAIM)
							goto rfinished;
					}
				}
			}
			p = p->pg_list.tqe_next;
		}
	}

rfinished:

/*
 * free the blocks that have been added to the reclaim list
 */
	for (i = 0; i < reclaimcount; i++) {
		swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1);
		swapsizecheck();
		wakeup((caddr_t) &in_reclaim);
	}

	splx(s);
	in_reclaim = 0;
	wakeup((caddr_t) &in_reclaim);
}


/*
 * swap_pager_copy copies blocks from one pager to another and
 * destroys the source pager
 */

void
swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset)
	vm_pager_t srcpager;
	vm_offset_t srcoffset;
	vm_pager_t dstpager;
	vm_offset_t dstoffset;
	vm_offset_t offset;
{
	sw_pager_t srcswp, dstswp;
	vm_offset_t i;
	int s;

	srcswp = (sw_pager_t) srcpager->pg_data;
	dstswp = (sw_pager_t) dstpager->pg_data;

/*
 * remove the source pager from the swap_pager internal queue
 */
	s = splbio();
	if (srcswp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list);
		srcswp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list);
	}

	while (srcswp->sw_poip) {
		tsleep((caddr_t)srcswp, PVM, "spgout", 0);
	}
	splx(s);

/*
 * clean all of the pages that are currently active and finished
 */
	(void) swap_pager_clean();

	s = splbio();
/*
 * clear source block before destination object
 * (release allocated space)
 */
	for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(srcswp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			swapsizecheck();
			*addr = SWB_EMPTY;
		}
	}
/*
 * transfer source to destination
 */
	for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) {
		int srcvalid, dstvalid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset,
			&srcvalid);
		int *dstaddrp;
	/*
	 * see if the source has space allocated
	 */
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
		/*
		 * if the source is valid and the dest has no space, then
		 * copy the allocation from the source to the dest.
		 */
			if (srcvalid) {
				dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid);
				/*
				 * if the dest already has a valid block, deallocate the
				 * source block without copying.
				 */
				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
					swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1);
					*dstaddrp = SWB_EMPTY;
				}
				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
					*dstaddrp = *srcaddrp;
					*srcaddrp = SWB_EMPTY;
					swap_pager_setvalid(dstswp, i + dstoffset, 1);
					vm_swap_size -= btodb(PAGE_SIZE);
				}
			}
		/*
		 * if the source is not empty at this point, then deallocate the space.
		 */
			if (*srcaddrp != SWB_EMPTY) {
				swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
				if( srcvalid)
					vm_swap_size += btodb(PAGE_SIZE);
				*srcaddrp = SWB_EMPTY;
			}
		}
	}

/*
 * deallocate the rest of the source object
 */
	for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) {
		int valid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid);
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
			swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			*srcaddrp = SWB_EMPTY;
		}
	}

	swapsizecheck();
	splx(s);

	free((caddr_t)srcswp->sw_blocks, M_VMPGDATA);
	srcswp->sw_blocks = 0;
	free((caddr_t)srcswp, M_VMPGDATA);
	srcpager->pg_data = 0;
	free((caddr_t)srcpager, M_VMPAGER);

	return;
}


void
swap_pager_dealloc(pager)
	vm_pager_t pager;
{
	register int i,j;
	register sw_blk_t bp;
	register sw_pager_t swp;
	int s;

	/*
	 * Remove from list right away so lookups will fail if we
	 * block for pageout completion.
	 */
	s = splbio();
	swp = (sw_pager_t) pager->pg_data;
	if (swp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
		swp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list);
	}
	/*
	 * Wait for all pageouts to finish and remove
	 * all entries from cleaning list.
	 */

	while (swp->sw_poip) {
		tsleep((caddr_t)swp, PVM, "swpout", 0);
	}
	splx(s);


	(void) swap_pager_clean();

	/*
	 * Free left over swap blocks
	 */
	s = splbio();
	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) {
		for (j = 0; j < SWB_NPAGES; j++)
		if (bp->swb_block[j] != SWB_EMPTY) {
			swap_pager_freeswapspace((unsigned)bp->swb_block[j],
				(unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1);
			if( bp->swb_valid & (1<<j))
				vm_swap_size += btodb(PAGE_SIZE);
			bp->swb_block[j] = SWB_EMPTY;
		}
	}
	splx(s);
	swapsizecheck();

	/*
	 * Free swap management resources
	 */
	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
	swp->sw_blocks = 0;
	free((caddr_t)swp, M_VMPGDATA);
	pager->pg_data = 0;
	free((caddr_t)pager, M_VMPAGER);
}

/*
 * swap_pager_getmulti can get multiple pages.
 */
int
swap_pager_getmulti(pager, m, count, reqpage, sync)
	vm_pager_t pager;
	vm_page_t *m;
	int count;
	int reqpage;
	boolean_t sync;
{
	if( reqpage >= count)
		panic("swap_pager_getmulti: reqpage >= count");
	return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage);
}

/*
 * swap_pager_getpage gets individual pages
 */
int
swap_pager_getpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	vm_page_t marray[1];

	marray[0] = m;
	return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0);
}

int
swap_pager_putmulti(pager, m, c, sync, rtvals)
	vm_pager_t pager;
	vm_page_t *m;
	int c;
	boolean_t sync;
	int *rtvals;
{
	int flags;

	if (pager == NULL) {
		(void) swap_pager_clean();
		return VM_PAGER_OK;
	}

	flags = B_WRITE;
	if (!sync)
		flags |= B_ASYNC;

	return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals);
}

/*
 * swap_pager_putpage writes individual pages
 */
int
swap_pager_putpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	int flags;
	vm_page_t marray[1];
	int rtvals[1];


	if (pager == NULL) {
		(void) swap_pager_clean();
		return VM_PAGER_OK;
	}

	marray[0] = m;
	flags = B_WRITE;
	if (!sync)
		flags |= B_ASYNC;

	swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals);

	return rtvals[0];
}

static inline int
const swap_pager_block_index(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	return (offset / (SWB_NPAGES*PAGE_SIZE));
}

static inline int
const swap_pager_block_offset(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE);
}
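
/*
 * These two inline helpers encode the same two-level lookup performed
 * open-coded in swap_pager_diskaddr and _swap_pager_haspage:
 * swap_pager_block_index selects the sw_blk_t cluster for an offset,
 * and swap_pager_block_offset selects the page slot within that
 * cluster.
 */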

/*
 * _swap_pager_haspage returns TRUE if the pager has data that has
 * been written out.
 */
static boolean_t
_swap_pager_haspage(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / (SWB_NPAGES*PAGE_SIZE);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
		return(FALSE);
	}
	swb = &swp->sw_blocks[ix];
	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
	if (swb->swb_block[ix] != SWB_EMPTY) {
		if (swb->swb_valid & (1 << ix))
			return TRUE;
	}

	return(FALSE);
}

/*
 * swap_pager_haspage is the externally accessible version of
 * _swap_pager_haspage above.  this routine takes a vm_pager_t
 * for an argument instead of sw_pager_t.
 */
boolean_t
swap_pager_haspage(pager, offset)
	vm_pager_t pager;
	vm_offset_t offset;
{
	return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset);
}

/*
 * swap_pager_freepage is a convenience routine that clears the busy
 * bit and deallocates a page.
 */
static void
swap_pager_freepage(m)
	vm_page_t m;
{
	PAGE_WAKEUP(m);
	vm_page_free(m);
}

/*
 * swap_pager_ridpages is a convenience routine that deallocates all
 * but the required page.  this is usually used in error returns that
 * need to invalidate the "extra" readahead pages.
 */
static void
swap_pager_ridpages(m, count, reqpage)
	vm_page_t *m;
	int count;
	int reqpage;
{
	int i;
	for (i = 0; i < count; i++)
		if (i != reqpage)
			swap_pager_freepage(m[i]);
}

int swapwritecount=0;

/*
 * swap_pager_iodone1 is the completion routine for both reads and
 * synchronous writes
 */
void
swap_pager_iodone1(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	bp->b_flags &= ~B_ASYNC;
	wakeup((caddr_t)bp);
/*
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
*/
}


int
swap_pager_input(swp, m, count, reqpage)
	register sw_pager_t swp;
	vm_page_t *m;
	int count, reqpage;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i;
	boolean_t rv;
	vm_offset_t kva, off[count];
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];

	int first, last;
	int failed;
	int reqdskregion;

	object = m[reqpage]->object;
	paging_offset = object->paging_offset;
	/*
	 * First determine if the page exists in the pager if this is
	 * a sync read.  This quickly handles cases where we are
	 * following shadow chains looking for the top level object
	 * with the page.
	 */
	if (swp->sw_blocks == NULL) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}

	for(i = 0; i < count; i++) {
		vm_offset_t foff = m[i]->offset + paging_offset;
		int ix = swap_pager_block_index(swp, foff);
		if (ix >= swp->sw_nblocks) {
			int j;
			if( i <= reqpage) {
				swap_pager_ridpages(m, count, reqpage);
				return(VM_PAGER_FAIL);
			}
			for(j = i; j < count; j++) {
				swap_pager_freepage(m[j]);
			}
			count = i;
			break;
		}

		swb[i] = &swp->sw_blocks[ix];
		off[i] = swap_pager_block_offset(swp, foff);
		reqaddr[i] = swb[i]->swb_block[off[i]];
	}

	/* make sure that our required input request exists */

	if (reqaddr[reqpage] == SWB_EMPTY ||
		(swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}


	reqdskregion = reqaddr[reqpage] / dmmax;

	/*
	 * search backwards for the first contiguous page to transfer
	 */
	failed = 0;
	first = 0;
	for (i = reqpage - 1; i >= 0; --i) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (first == 0)
					first = i + 1;
		}
	}
	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	last = count;
	for (i = reqpage + 1; i < count; i++) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (last == count)
					last = i;
		}
	}

	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i-first] = m[i];
			reqaddr[i-first] = reqaddr[i];
			off[i-first] = off[i];
		}
		count -= first;
		reqpage -= first;
	}

	++swb[reqpage]->swb_locked;

	/*
	 * at this point:
	 * "m" is a pointer to the array of vm_page_t for paging I/O
	 * "count" is the number of vm_page_t entries represented by "m"
	 * "object" is the vm_object_t for I/O
	 * "reqpage" is the index into "m" for the page actually faulted
	 */

	spc = NULL;	/* we might not use an spc data structure */

	if (count == 1) {
		/*
		 * if a kva has not been allocated, we can only do a one page transfer,
		 * so we free the other pages that might have been allocated by
		 * vm_fault.
		 */
		swap_pager_ridpages(m, count, reqpage);
		m[0] = m[reqpage];
		reqaddr[0] = reqaddr[reqpage];

		count = 1;
		reqpage = 0;
	/*
	 * get a swap pager clean data structure, block until we get it
	 */
		if (swap_pager_free.tqh_first == NULL) {
			s = splbio();
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
			while (swap_pager_free.tqh_first == NULL) {
				swap_pager_needflags |= SWAP_FREE_NEEDED;
				tsleep((caddr_t)&swap_pager_free,
					PVM, "swpfre", 0);
				if( curproc == pageproc)
					(void) swap_pager_clean();
				else
					wakeup((caddr_t) &vm_pages_needed);
			}
			splx(s);
		}
		spc = swap_pager_free.tqh_first;
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
		kva = spc->spc_kva;
		bp = spc->spc_bp;
		bzero(bp, sizeof *bp);
		bp->b_spc = spc;
	} else {
	/*
	 * Get a swap buffer header to perform the IO
	 */
		bp = getpbuf();
		kva = (vm_offset_t) bp->b_data;
	}

	/*
	 * map our page(s) into kva for input
	 */
	pmap_qenter( kva, m, count);

	s = splbio();
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = swap_pager_iodone1;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;

	bgetvp( swapdev_vp, bp);

	swp->sw_piip++;

	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swread", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_piip;
	if (swp->sw_piip == 0)
		wakeup((caddr_t) swp);

	/*
	 * relpbuf does this, but we maintain our own buffer
	 * list also...
	 */
	if (bp->b_vp)
		brelvp(bp);

	splx(s);
	--swb[reqpage]->swb_locked;

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	if (spc) {
		/*
		 * if we have used an spc, we need to free it.
		 */
		if( bp->b_rcred != NOCRED)
			crfree(bp->b_rcred);
		if( bp->b_wcred != NOCRED)
			crfree(bp->b_wcred);
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
	} else {
		/*
		 * release the physical I/O buffer
		 */
		relpbuf(bp);
		/*
		 * finish up input if everything is ok
		 */
		if( rv == VM_PAGER_OK) {
			for (i = 0; i < count; i++) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				if (i != reqpage) {
					/*
					 * whether or not to leave the page activated
					 * is up in the air, but we should put the page
					 * on a page queue somewhere. (it already is in
					 * the object).
					 * After some empirical results, it is best
					 * to deactivate the readahead pages.
					 */
					vm_page_deactivate(m[i]);

					/*
					 * just in case someone was asking for this
					 * page we now tell them that it is ok to use
					 */
					m[i]->flags &= ~PG_FAKE;
					PAGE_WAKEUP(m[i]);
				}
			}
			if( swap_pager_full) {
				_swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE);
			}
		} else {
			swap_pager_ridpages(m, count, reqpage);
		}
	}
	return(rv);
}

int
swap_pager_output(swp, m, count, flags, rtvals)
	register sw_pager_t swp;
	vm_page_t *m;
	int count;
	int flags;
	int *rtvals;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i, j, ix;
	boolean_t rv;
	vm_offset_t kva, off, foff;
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];
	int failed;

/*
	if( count > 1)
		printf("off: 0x%x, count: %d\n", m[0]->offset, count);
*/
	spc = NULL;

	object = m[0]->object;
	paging_offset = object->paging_offset;

	failed = 0;
	for(j=0;j<count;j++) {
		foff = m[j]->offset + paging_offset;
		ix = swap_pager_block_index(swp, foff);
		swb[j] = 0;
		if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
			rtvals[j] = VM_PAGER_FAIL;
			failed = 1;
			continue;
		} else {
			rtvals[j] = VM_PAGER_OK;
		}
		swb[j] = &swp->sw_blocks[ix];
		++swb[j]->swb_locked;
		if( failed) {
			rtvals[j] = VM_PAGER_FAIL;
			continue;
		}
		off = swap_pager_block_offset(swp, foff);
		reqaddr[j] = swb[j]->swb_block[off];
		if( reqaddr[j] == SWB_EMPTY) {
			int blk;
			int tries;
			int ntoget;
			tries = 0;
			s = splbio();

			/*
			 * if any other pages have been allocated in this block, we
			 * only try to get one page.
			 */
			for (i = 0; i < SWB_NPAGES; i++) {
				if (swb[j]->swb_block[i] != SWB_EMPTY)
					break;
			}


			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
			/*
			 * this code is a little conservative, but works
			 * (the intent of this code is to allocate small chunks
			 *  for small objects)
			 */
			if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) {
				ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE;
			}

retrygetspace:
			if (!swap_pager_full && ntoget > 1 &&
				swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) {

				for (i = 0; i < ntoget; i++) {
					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
					swb[j]->swb_valid = 0;
				}

				reqaddr[j] = swb[j]->swb_block[off];
			} else if (!swap_pager_getswapspace(btodb(PAGE_SIZE),
				&swb[j]->swb_block[off])) {
				/*
				 * if the allocation has failed, we try to reclaim space and
				 * retry.
				 */
				if (++tries == 1) {
					swap_pager_reclaim();
					goto retrygetspace;
				}
				rtvals[j] = VM_PAGER_AGAIN;
				failed = 1;
			} else {
				reqaddr[j] = swb[j]->swb_block[off];
				swb[j]->swb_valid &= ~(1<<off);
			}
			splx(s);
		}
	}

	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	for (i = 0; i < count; i++) {
		if( failed || (reqaddr[i] != reqaddr[0] + i*btodb(PAGE_SIZE)) ||
			(reqaddr[i] / dmmax) != (reqaddr[0] / dmmax) ||
			(rtvals[i] != VM_PAGER_OK)) {
			failed = 1;
			if( rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
	}

	for(i = 0; i < count; i++) {
		if( rtvals[i] != VM_PAGER_OK) {
			if( swb[i])
				--swb[i]->swb_locked;
		}
	}

	for(i = 0; i < count; i++)
		if( rtvals[i] != VM_PAGER_OK)
			break;

	if( i == 0) {
		return VM_PAGER_AGAIN;
	}

	count = i;
	for(i=0;i<count;i++) {
		if( reqaddr[i] == SWB_EMPTY)
			printf("I/O to empty block????\n");
	}

	/*
	 * For synchronous writes, we clean up
	 * all completed async pageouts.
	 */
	if ((flags & B_ASYNC) == 0) {
		swap_pager_clean();
	}

	kva = 0;

	/*
	 * we allocate a new kva for transfers > 1 page
	 * but for transfers == 1 page, the swap_pager_free list contains
	 * entries that have pre-allocated kva's (for efficiency).
	 * NOTE -- we do not use the physical buffer pool or the
	 * preallocated associated kva's because of the potential for
	 * deadlock.  This is very subtle -- but deadlocks or resource
	 * contention must be avoided on pageouts -- or your system will
	 * sleep (forever) !!!
	 */
/*
	if ( count > 1) {
		kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE);
		if( !kva) {
			for (i = 0; i < count; i++) {
				if( swb[i])
					--swb[i]->swb_locked;
				rtvals[i] = VM_PAGER_AGAIN;
			}
			return VM_PAGER_AGAIN;
		}
	}
*/

	/*
	 * get a swap pager clean data structure, block until we get it
	 */
	if (swap_pager_free.tqh_first == NULL) {
		s = splbio();
		if( curproc == pageproc)
			(void) swap_pager_clean();
		else
			wakeup((caddr_t) &vm_pages_needed);
		while (swap_pager_free.tqh_first == NULL) {
			swap_pager_needflags |= SWAP_FREE_NEEDED;
			tsleep((caddr_t)&swap_pager_free,
				PVM, "swpfre", 0);
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
		}
		splx(s);
	}

	spc = swap_pager_free.tqh_first;
	TAILQ_REMOVE(&swap_pager_free, spc, spc_list);

	kva = spc->spc_kva;

	/*
	 * map our page(s) into kva for I/O
	 */
	pmap_qenter(kva, m, count);

	/*
	 * get the base I/O offset into the swap file
	 */
	for(i=0;i<count;i++) {
		foff = m[i]->offset + paging_offset;
		off = swap_pager_block_offset(swp, foff);
		/*
		 * if we are setting the valid bit anew,
		 * then diminish the swap free space
		 */
		if( (swb[i]->swb_valid & (1 << off)) == 0)
			vm_swap_size -= btodb(PAGE_SIZE);

		/*
		 * set the valid bit
		 */
		swb[i]->swb_valid |= (1 << off);
		/*
		 * and unlock the data structure
		 */
		--swb[i]->swb_locked;
	}

	s = splbio();
	/*
	 * Get a swap buffer header and perform the IO
	 */
	bp = spc->spc_bp;
	bzero(bp, sizeof *bp);
	bp->b_spc = spc;

	bp->b_flags = B_BUSY;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if( bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_data = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bgetvp( swapdev_vp, bp);

	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;
	swapdev_vp->v_numoutput++;

	/*
	 * If this is an async write we set up additional buffer fields
	 * and place a "cleaning" entry on the inuse queue.
	 */
	if ( flags & B_ASYNC ) {
		spc->spc_flags = 0;
		spc->spc_swp = swp;
		for(i=0;i<count;i++)
			spc->spc_m[i] = m[i];
		spc->spc_count = count;
		/*
		 * the completion routine for async writes
		 */
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;
		swp->sw_poip++;
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
	} else {
		swp->sw_poip++;
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone1;
	}
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);
	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) {
		if ((bp->b_flags & B_DONE) == B_DONE) {
			swap_pager_clean();
		}
		splx(s);
		for(i=0;i<count;i++) {
			rtvals[i] = VM_PAGER_PEND;
		}
		return VM_PAGER_PEND;
	}

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swwrt", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_poip;
	if (swp->sw_poip == 0)
		wakeup((caddr_t) swp);

	if (bp->b_vp)
		brelvp(bp);

	splx(s);

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	/*
	 * if we have written the page, then indicate that the page
	 * is clean.
	 */
	if (rv == VM_PAGER_OK) {
		for(i=0;i<count;i++) {
			if( rtvals[i] == VM_PAGER_OK) {
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				/*
				 * optimization: if a page has been read during the
				 * pageout process, we activate it.
				 */
				if ( (m[i]->flags & PG_ACTIVE) == 0 &&
					pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))
					vm_page_activate(m[i]);
			}
		}
	} else {
		for(i=0;i<count;i++) {
			rtvals[i] = rv;
			m[i]->flags |= PG_LAUNDRY;
		}
	}

	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
	if (swap_pager_needflags & SWAP_FREE_NEEDED) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
	}

	return(rv);
}

boolean_t
swap_pager_clean()
{
	register swp_clean_t spc, tspc;
	register int s;

	tspc = NULL;
	if (swap_pager_done.tqh_first == NULL)
		return FALSE;
	for (;;) {
		s = splbio();
		/*
		 * Look up and removal from done list must be done
		 * at splbio() to avoid conflicts with swap_pager_iodone.
		 */
		while ((spc = swap_pager_done.tqh_first) != 0) {
			pmap_qremove( spc->spc_kva, spc->spc_count);
			swap_pager_finish(spc);
			TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
			goto doclean;
		}

		/*
		 * No operations done, that's all we can do for now.
		 */

		splx(s);
		break;

		/*
		 * The desired page was found to be busy earlier in
		 * the scan but has since completed.
		 */
doclean:
		if (tspc && tspc == spc) {
			tspc = NULL;
		}
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
		++cleandone;
		splx(s);
	}

	return(tspc ? TRUE : FALSE);
}

void
swap_pager_finish(spc)
	register swp_clean_t spc;
{
	vm_object_t object = spc->spc_m[0]->object;
	int i;

	if ((object->paging_in_progress -= spc->spc_count) == 0)
		thread_wakeup((int) object);

	/*
	 * If no error, mark as clean and inform the pmap system.
	 * If error, mark as dirty so we will try again.
	 * (XXX could get stuck doing this, should give up after awhile)
	 */
	if (spc->spc_flags & SPC_ERROR) {
		for(i=0;i<spc->spc_count;i++) {
			printf("swap_pager_finish: clean of page %lx failed\n",
			       (u_long)VM_PAGE_TO_PHYS(spc->spc_m[i]));
			spc->spc_m[i]->flags |= PG_LAUNDRY;
		}
	} else {
		for(i=0;i<spc->spc_count;i++) {
			pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i]));
			spc->spc_m[i]->flags |= PG_CLEAN;
		}
	}


	for(i=0;i<spc->spc_count;i++) {
		/*
		 * we wakeup any processes that are waiting on
		 * these pages.
		 */
		PAGE_WAKEUP(spc->spc_m[i]);
	}
	nswiodone -= spc->spc_count;

	return;
}

/*
 * swap_pager_iodone is the completion routine for async writes; it
 * moves the cleaning entry from the inuse queue to the done queue.
 */
void
swap_pager_iodone(bp)
	register struct buf *bp;
{
	register swp_clean_t spc;
	int s;

	s = splbio();
	spc = (swp_clean_t) bp->b_spc;
	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
	if (bp->b_flags & B_ERROR) {
		spc->spc_flags |= SPC_ERROR;
		printf("error %d blkno %lu sz %ld ",
			bp->b_error, (u_long)bp->b_blkno, bp->b_bcount);
	}

/*
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
*/

	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC);
	if (bp->b_vp) {
		brelvp(bp);
	}
	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);

	nswiodone += spc->spc_count;
	if (--spc->spc_swp->sw_poip == 0) {
		wakeup((caddr_t)spc->spc_swp);
	}

	if ((swap_pager_needflags & SWAP_FREE_NEEDED) ||
	    swap_pager_inuse.tqh_first == 0) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
		wakeup((caddr_t)&vm_pages_needed);
	}

	if (vm_pageout_pages_needed) {
		wakeup((caddr_t)&vm_pageout_pages_needed);
	}

	if ((swap_pager_inuse.tqh_first == NULL) ||
	    (cnt.v_free_count < cnt.v_free_min &&
	    nswiodone + cnt.v_free_count >= cnt.v_free_min) ) {
		wakeup((caddr_t)&vm_pages_needed);
	}
	splx(s);
}

/*
 * return true if any swap control structures can be allocated
 */
int
swap_pager_ready() {
	if( swap_pager_free.tqh_first)
		return 1;
	else
		return 0;
}