xref: /freebsd/sys/vm/swap_pager.c (revision 16f62314cdee3347833cdcbe2f2a8fbacea1e5b5)
/*
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
 *
 *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
 * $Id: swap_pager.c,v 1.4 1994/08/02 07:55:13 davidg Exp $
 */

/*
 * Quick hack to page to dedicated partition(s).
 * TODO:
 *	Add multiprocessor locks
 *	Deal with async writes in a better fashion
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>

#include <miscfs/specfs/specdev.h>
#include <sys/rlist.h>

#include <vm/vm.h>
#include <vm/vm_pager.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>

#ifndef NPENDINGIO
#define NPENDINGIO	16
#endif

extern int nswbuf;
int nswiodone;
extern int vm_pageout_rate_limit;
static int cleandone;
extern int hz;
int swap_pager_full;
extern vm_map_t pager_map;
extern int vm_pageout_pages_needed;
extern int vm_swap_size;
extern struct vnode *swapdev_vp;

#define MAX_PAGEOUT_CLUSTER 8

TAILQ_HEAD(swpclean, swpagerclean);

typedef	struct swpagerclean	*swp_clean_t;

struct swpagerclean {
	TAILQ_ENTRY(swpagerclean)	spc_list;
	int				spc_flags;
	struct buf			*spc_bp;
	sw_pager_t			spc_swp;
	vm_offset_t			spc_kva;
	vm_offset_t			spc_altkva;
	int				spc_count;
	vm_page_t			spc_m[MAX_PAGEOUT_CLUSTER];
} swcleanlist[NPENDINGIO];

extern vm_map_t kernel_map;

/* spc_flags values */
#define SPC_ERROR	0x01

#define SWB_EMPTY (-1)

void		swap_pager_init(void);
vm_pager_t	swap_pager_alloc(caddr_t, vm_size_t, vm_prot_t, vm_offset_t);
void		swap_pager_dealloc(vm_pager_t);
boolean_t	swap_pager_getpage(vm_pager_t, vm_page_t, boolean_t);
boolean_t	swap_pager_putpage(vm_pager_t, vm_page_t, boolean_t);
boolean_t	swap_pager_getmulti(vm_pager_t, vm_page_t *, int, int, boolean_t);
boolean_t	swap_pager_haspage(vm_pager_t, vm_offset_t);
int		swap_pager_io(sw_pager_t, vm_page_t *, int, int, int);
void		swap_pager_iodone(struct buf *);
boolean_t	swap_pager_clean();

extern struct pagerops swappagerops;

struct swpclean swap_pager_done;	/* list of completed page cleans */
struct swpclean swap_pager_inuse;	/* list of pending page cleans */
struct swpclean swap_pager_free;	/* list of free pager clean structs */
struct pagerlst swap_pager_list;	/* list of "named" anon regions */
struct pagerlst swap_pager_un_list;	/* list of "unnamed" anon pagers */
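
/*
 * Life cycle of a "swpagerclean" (spc) entry, as implemented below: an spc
 * starts out on swap_pager_free, moves to swap_pager_inuse when an async
 * pageout is issued by swap_pager_output(), is moved to swap_pager_done by
 * swap_pager_iodone(), and is finally returned to swap_pager_free by
 * swap_pager_clean().  Synchronous transfers borrow an spc for its
 * preallocated kva and buffer header but bypass the inuse/done queues.
 */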

#define	SWAP_FREE_NEEDED	0x1	/* need a swap block */
int swap_pager_needflags;
struct rlist *swapfrag;

struct pagerlst *swp_qs[]={
	&swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0
};

int swap_pager_putmulti();

struct pagerops swappagerops = {
	swap_pager_init,
	swap_pager_alloc,
	swap_pager_dealloc,
	swap_pager_getpage,
	swap_pager_getmulti,
	swap_pager_putpage,
	swap_pager_putmulti,
	swap_pager_haspage
};

int npendingio = NPENDINGIO;
int pendingiowait;
int require_swap_init;
void swap_pager_finish();
int dmmin, dmmax;
extern int vm_page_count;

struct buf *getpbuf();
void relpbuf(struct buf *bp);

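/*
 * Hysteresis on the swap_pager_full flag: swap is considered "full" when
 * fewer than 128 pages worth of disk blocks remain, and the flag is only
 * cleared once more than 192 pages worth are free again, so that it does
 * not flap at the boundary.  (For example, assuming 4K pages and 512-byte
 * disk blocks, btodb(PAGE_SIZE) == 8 and the watermarks are 1024 and 1536
 * disk blocks respectively.)
 */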
static inline void swapsizecheck() {
	if( vm_swap_size < 128*btodb(PAGE_SIZE)) {
		if( swap_pager_full == 0)
			printf("swap_pager: out of space\n");
		swap_pager_full = 1;
	} else if( vm_swap_size > 192*btodb(PAGE_SIZE))
		swap_pager_full = 0;
}

void
swap_pager_init()
{
	extern int dmmin, dmmax;

	dfltpagerops = &swappagerops;

	TAILQ_INIT(&swap_pager_list);
	TAILQ_INIT(&swap_pager_un_list);

	/*
	 * Initialize clean lists
	 */
	TAILQ_INIT(&swap_pager_inuse);
	TAILQ_INIT(&swap_pager_done);
	TAILQ_INIT(&swap_pager_free);

	require_swap_init = 1;

	/*
	 * Calculate the swap allocation constants.
	 */

	dmmin = CLBYTES/DEV_BSIZE;
	dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2;

}
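
/*
 * dmmin and dmmax are the traditional swap-allocation constants: dmmin is
 * the smallest allocation unit (one click, CLBYTES, expressed in DEV_BSIZE
 * disk blocks) and dmmax here is twice the size of an SWB_NPAGES page
 * cluster.  In this file dmmax bounds I/O clustering: the reqaddr/dmmax
 * checks in swap_pager_input() and swap_pager_output() refuse to build a
 * cluster that crosses a dmmax-sized region of the swap area.
 */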

/*
 * Allocate a pager structure and associated resources.
 * Note that if we are called from the pageout daemon (handle == NULL)
 * we should not wait for memory as it could result in deadlock.
 */
vm_pager_t
swap_pager_alloc(handle, size, prot, offset)
	caddr_t handle;
	register vm_size_t size;
	vm_prot_t prot;
	vm_offset_t offset;
{
	register vm_pager_t pager;
	register sw_pager_t swp;
	int waitok;
	int i,j;

	if (require_swap_init) {
		swp_clean_t spc;
		struct buf *bp;
		/*
		 * kva's are allocated here so that we don't need to keep
		 * doing kmem_alloc_pageable calls at runtime
		 */
		for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) {
			spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE);
			if (!spc->spc_kva) {
				break;
			}
			spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT);
			if (!spc->spc_bp) {
				kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE);
				break;
			}
			spc->spc_flags = 0;
			TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		}
		require_swap_init = 0;
		if( size == 0)
			return(NULL);
	}

	/*
	 * If this is a "named" anonymous region, look it up and
	 * return the appropriate pager if it exists.
	 */
	if (handle) {
		pager = vm_pager_lookup(&swap_pager_list, handle);
		if (pager != NULL) {
			/*
			 * Use vm_object_lookup to gain a reference
			 * to the object and also to remove from the
			 * object cache.
			 */
			if (vm_object_lookup(pager) == NULL)
				panic("swap_pager_alloc: bad object");
			return(pager);
		}
	}

	if (swap_pager_full) {
		return(NULL);
	}

	/*
	 * Pager doesn't exist, allocate swap management resources
	 * and initialize.
	 */
	waitok = handle ? M_WAITOK : M_NOWAIT;
	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
	if (pager == NULL)
		return(NULL);
	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
	if (swp == NULL) {
		free((caddr_t)pager, M_VMPAGER);
		return(NULL);
	}
	size = round_page(size);
	swp->sw_osize = size;
	swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE);
	swp->sw_blocks = (sw_blk_t)
		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
		       M_VMPGDATA, waitok);
	if (swp->sw_blocks == NULL) {
		free((caddr_t)swp, M_VMPGDATA);
		free((caddr_t)pager, M_VMPAGER);
		return(NULL);
	}

	for (i = 0; i < swp->sw_nblocks; i++) {
		swp->sw_blocks[i].swb_valid = 0;
		swp->sw_blocks[i].swb_locked = 0;
		for (j = 0; j < SWB_NPAGES; j++)
			swp->sw_blocks[i].swb_block[j] = SWB_EMPTY;
	}

	swp->sw_poip = 0;
	if (handle) {
		vm_object_t object;

		swp->sw_flags = SW_NAMED;
		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
		/*
		 * Consistent with other pagers: return with object
		 * referenced.  Can't do this with handle == NULL
		 * since it might be the pageout daemon calling.
		 */
		object = vm_object_allocate(size);
		vm_object_enter(object, pager);
		vm_object_setpager(object, pager, 0, FALSE);
	} else {
		swp->sw_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list);
	}
	pager->pg_handle = handle;
	pager->pg_ops = &swappagerops;
	pager->pg_type = PG_SWAP;
	pager->pg_data = (caddr_t)swp;

	return(pager);
}

/*
 * returns the disk block associated with a pager and offset;
 * additionally, as a side effect, returns a flag indicating
 * whether the block has been written (is valid)
 */

static int *
swap_pager_diskaddr(swp, offset, valid)
	sw_pager_t swp;
	vm_offset_t offset;
	int *valid;
{
	register sw_blk_t swb;
	int ix;

	if (valid)
		*valid = 0;
	ix = offset / (SWB_NPAGES*PAGE_SIZE);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
		return(NULL);
	}
	swb = &swp->sw_blocks[ix];
	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
	if (valid)
		*valid = swb->swb_valid & (1<<ix);
	return &swb->swb_block[ix];
}
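
/*
 * Index arithmetic by example: assuming SWB_NPAGES == 8 and 4K pages, each
 * sw_blk_t covers 32K (0x8000) of the object.  Offset 0x9000 selects
 * sw_blocks[0x9000 / 0x8000] == sw_blocks[1], and within that block page
 * index (0x9000 % 0x8000) / 0x1000 == 1, so the disk address lives in
 * sw_blocks[1].swb_block[1] and its written flag is bit 1 of
 * sw_blocks[1].swb_valid.
 */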

/*
 * Utility routine to set the valid (written) bit for
 * a block associated with a pager and offset
 */
static void
swap_pager_setvalid(swp, offset, valid)
	sw_pager_t swp;
	vm_offset_t offset;
	int valid;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / (SWB_NPAGES*PAGE_SIZE);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks)
		return;

	swb = &swp->sw_blocks[ix];
	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
	if (valid)
		swb->swb_valid |= (1 << ix);
	else
		swb->swb_valid &= ~(1 << ix);
	return;
}

/*
 * this routine allocates swap space with a fragmentation
 * minimization policy.
 */
int
swap_pager_getswapspace( unsigned amount, unsigned *rtval) {
	unsigned tmpalloc;
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	if( amount < nblocksfrag) {
		if( rlist_alloc(&swapfrag, amount, rtval))
			return 1;
		if( !rlist_alloc(&swapmap, nblocksfrag, &tmpalloc))
			return 0;
		rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1);
		*rtval = tmpalloc;
		return 1;
	}
	if( !rlist_alloc(&swapmap, amount, rtval))
		return 0;
	else
		return 1;
}
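
/*
 * By way of example (assuming SWB_NPAGES == 8, 4K pages and 512-byte disk
 * blocks, so nblocksfrag == 64): a request for 8 blocks is sub-cluster
 * sized and is satisfied from the swapfrag list when possible; otherwise a
 * full 64-block cluster is carved out of swapmap, the first 8 blocks are
 * handed to the caller, and the remaining 56 are donated to swapfrag for
 * later small requests.  Requests of a full cluster or more go straight to
 * swapmap.
 */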

/*
 * this routine frees swap space with a fragmentation
 * minimization policy.
 */
void
swap_pager_freeswapspace( unsigned from, unsigned to) {
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	unsigned tmpalloc;
	if( ((to + 1) - from) >= nblocksfrag) {
		while( (from + nblocksfrag) <= to + 1) {
			rlist_free(&swapmap, from, from + nblocksfrag - 1);
			from += nblocksfrag;
		}
	}
	if( from > to)
		return;
	rlist_free(&swapfrag, from, to);
	while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) {
		rlist_free(&swapmap, tmpalloc, tmpalloc + nblocksfrag-1);
	}
}
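
/*
 * This is the inverse of the allocation policy above: whole
 * nblocksfrag-sized cluster runs go back to swapmap directly, any
 * sub-cluster remainder is freed to swapfrag, and once enough fragments
 * coalesce there to form a full cluster it is promoted back to swapmap.
 */
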
/*
 * this routine frees swap blocks from a specified pager
 */
void
_swap_pager_freespace(swp, start, size)
	sw_pager_t swp;
	vm_offset_t start;
	vm_offset_t size;
{
	vm_offset_t i;
	int s;

	s = splbio();
	for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(swp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid) {
				vm_swap_size += btodb(PAGE_SIZE);
				swap_pager_setvalid(swp, i, 0);
			}
			*addr = SWB_EMPTY;
		}
	}
	swapsizecheck();
	splx(s);
}

void
swap_pager_freespace(pager, start, size)
	vm_pager_t pager;
	vm_offset_t start;
	vm_offset_t size;
{
	_swap_pager_freespace((sw_pager_t) pager->pg_data, start, size);
}

/*
 * swap_pager_reclaim frees up over-allocated space from all pagers.
 * This eliminates internal fragmentation due to allocation of space
 * for segments that are never swapped to.  It has been written so that
 * it does not block until the rlist_free operation occurs; it keeps
 * the queues consistent.
 */

/*
 * Maximum number of blocks (pages) to reclaim per pass
 */
#define MAXRECLAIM 256

void
swap_pager_reclaim()
{
	vm_pager_t p;
	sw_pager_t swp;
	int i, j, k;
	int s;
	int reclaimcount;
	static int reclaims[MAXRECLAIM];
	static int in_reclaim;

/*
 * allow only one process to be in the swap_pager_reclaim subroutine
 */
	s = splbio();
	if (in_reclaim) {
		tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0);
		splx(s);
		return;
	}
	in_reclaim = 1;
	reclaimcount = 0;

	/* for each pager queue */
	for (k = 0; swp_qs[k]; k++) {

		p = swp_qs[k]->tqh_first;
		while (p && (reclaimcount < MAXRECLAIM)) {

			/*
			 * see if any blocks associated with a pager have been
			 * allocated but not used (written)
			 */
			swp = (sw_pager_t) p->pg_data;
			for (i = 0; i < swp->sw_nblocks; i++) {
				sw_blk_t swb = &swp->sw_blocks[i];
				if( swb->swb_locked)
					continue;
				for (j = 0; j < SWB_NPAGES; j++) {
					if (swb->swb_block[j] != SWB_EMPTY &&
						(swb->swb_valid & (1 << j)) == 0) {
						reclaims[reclaimcount++] = swb->swb_block[j];
						swb->swb_block[j] = SWB_EMPTY;
						if (reclaimcount >= MAXRECLAIM)
							goto rfinished;
					}
				}
			}
			p = p->pg_list.tqe_next;
		}
	}

rfinished:

/*
 * free the blocks that have been added to the reclaim list
 */
	for (i = 0; i < reclaimcount; i++) {
		swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1);
		swapsizecheck();
		wakeup((caddr_t) &in_reclaim);
	}

	splx(s);
	in_reclaim = 0;
	wakeup((caddr_t) &in_reclaim);
}
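
/*
 * Reclaiming a block whose valid bit is clear is safe because nothing is
 * ever paged back in from an unwritten block: swap_pager_haspage() and
 * swap_pager_input() both require the swb_valid bit before trusting a disk
 * address.  Blocks with swb_locked set (I/O being set up) are skipped.
 */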

/*
 * swap_pager_copy copies blocks from one pager to another and
 * destroys the source pager
 */

void
swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset)
	vm_pager_t srcpager;
	vm_offset_t srcoffset;
	vm_pager_t dstpager;
	vm_offset_t dstoffset;
	vm_offset_t offset;
{
	sw_pager_t srcswp, dstswp;
	vm_offset_t i;
	int s;

	srcswp = (sw_pager_t) srcpager->pg_data;
	dstswp = (sw_pager_t) dstpager->pg_data;

/*
 * remove the source pager from the swap_pager internal queue
 */
	s = splbio();
	if (srcswp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list);
		srcswp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list);
	}

	while (srcswp->sw_poip) {
		tsleep((caddr_t)srcswp, PVM, "spgout", 0);
	}
	splx(s);

/*
 * clean all of the pages that are currently active and finished
 */
	(void) swap_pager_clean();

	s = splbio();
/*
 * clear source block before destination object
 * (release allocated space)
 */
	for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(srcswp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			swapsizecheck();
			*addr = SWB_EMPTY;
		}
	}
/*
 * transfer source to destination
 */
	for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) {
		int srcvalid, dstvalid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset,
			&srcvalid);
		int *dstaddrp;
	/*
	 * see if the source has space allocated
	 */
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
		/*
		 * if the source is valid and the dest has no space, then
		 * copy the allocation from the source to the dest.
		 */
			if (srcvalid) {
				dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid);
				/*
				 * if the dest already has a valid block, deallocate the
				 * source block without copying.
				 */
				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
					swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1);
					*dstaddrp = SWB_EMPTY;
				}
				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
					*dstaddrp = *srcaddrp;
					*srcaddrp = SWB_EMPTY;
					swap_pager_setvalid(dstswp, i + dstoffset, 1);
					vm_swap_size -= btodb(PAGE_SIZE);
				}
			}
		/*
		 * if the source is not empty at this point, then deallocate the space.
		 */
			if (*srcaddrp != SWB_EMPTY) {
				swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
				if( srcvalid)
					vm_swap_size += btodb(PAGE_SIZE);
				*srcaddrp = SWB_EMPTY;
			}
		}
	}

/*
 * deallocate the rest of the source object
 */
	for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) {
		int valid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid);
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
			swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			*srcaddrp = SWB_EMPTY;
		}
	}

	swapsizecheck();
	splx(s);

	free((caddr_t)srcswp->sw_blocks, M_VMPGDATA);
	srcswp->sw_blocks = 0;
	free((caddr_t)srcswp, M_VMPGDATA);
	srcpager->pg_data = 0;
	free((caddr_t)srcpager, M_VMPAGER);

	return;
}
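
/*
 * Offset bookkeeping in swap_pager_copy above: destination offset i
 * corresponds to source offset i + offset + srcoffset, so the source
 * ranges [0, offset + srcoffset) and
 * [dstswp->sw_osize + offset + srcoffset, srcswp->sw_osize) fall outside
 * the destination object and are simply released.
 */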

void
swap_pager_dealloc(pager)
	vm_pager_t pager;
{
	register int i,j;
	register sw_blk_t bp;
	register sw_pager_t swp;
	int s;

	/*
	 * Remove from list right away so lookups will fail if we
	 * block for pageout completion.
	 */
	s = splbio();
	swp = (sw_pager_t) pager->pg_data;
	if (swp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
		swp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list);
	}
	/*
	 * Wait for all pageouts to finish and remove
	 * all entries from cleaning list.
	 */

	while (swp->sw_poip) {
		tsleep((caddr_t)swp, PVM, "swpout", 0);
	}
	splx(s);

	(void) swap_pager_clean();

	/*
	 * Free left over swap blocks
	 */
	s = splbio();
	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) {
		for (j = 0; j < SWB_NPAGES; j++)
			if (bp->swb_block[j] != SWB_EMPTY) {
				swap_pager_freeswapspace((unsigned)bp->swb_block[j],
					(unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1);
				if( bp->swb_valid & (1<<j))
					vm_swap_size += btodb(PAGE_SIZE);
				bp->swb_block[j] = SWB_EMPTY;
			}
	}
	splx(s);
	swapsizecheck();

	/*
	 * Free swap management resources
	 */
	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
	swp->sw_blocks = 0;
	free((caddr_t)swp, M_VMPGDATA);
	pager->pg_data = 0;
	free((caddr_t)pager, M_VMPAGER);
}

/*
 * swap_pager_getmulti can get multiple pages.
 */
int
swap_pager_getmulti(pager, m, count, reqpage, sync)
	vm_pager_t pager;
	vm_page_t *m;
	int count;
	int reqpage;
	boolean_t sync;
{
	if( reqpage >= count)
		panic("swap_pager_getmulti: reqpage >= count");
	return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage);
}

/*
 * swap_pager_getpage gets individual pages
 */
int
swap_pager_getpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	vm_page_t marray[1];

	marray[0] = m;
	return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0);
}

int
swap_pager_putmulti(pager, m, c, sync, rtvals)
	vm_pager_t pager;
	vm_page_t *m;
	int c;
	boolean_t sync;
	int *rtvals;
{
	int flags;

	if (pager == NULL) {
		(void) swap_pager_clean();
		return VM_PAGER_OK;
	}

	flags = B_WRITE;
	if (!sync)
		flags |= B_ASYNC;

	return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals);
}

/*
 * swap_pager_putpage writes individual pages
 */
int
swap_pager_putpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	int flags;
	vm_page_t marray[1];
	int rtvals[1];

	if (pager == NULL) {
		(void) swap_pager_clean();
		return VM_PAGER_OK;
	}

	marray[0] = m;
	flags = B_WRITE;
	if (!sync)
		flags |= B_ASYNC;

	swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals);

	return rtvals[0];
}

static inline int
const swap_pager_block_index(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	return (offset / (SWB_NPAGES*PAGE_SIZE));
}

static inline int
const swap_pager_block_offset(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE);
}

/*
 * _swap_pager_haspage returns TRUE if the pager has data that has
 * been written out.
 */
static boolean_t
_swap_pager_haspage(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / (SWB_NPAGES*PAGE_SIZE);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
		return(FALSE);
	}
	swb = &swp->sw_blocks[ix];
	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
	if (swb->swb_block[ix] != SWB_EMPTY) {
		if (swb->swb_valid & (1 << ix))
			return TRUE;
	}

	return(FALSE);
}

/*
 * swap_pager_haspage is the externally accessible version of
 * _swap_pager_haspage above.  This routine takes a vm_pager_t
 * for an argument instead of sw_pager_t.
 */
boolean_t
swap_pager_haspage(pager, offset)
	vm_pager_t pager;
	vm_offset_t offset;
{
	return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset);
}

/*
 * swap_pager_freepage is a convenience routine that clears the busy
 * bit and deallocates a page.
 */
static void
swap_pager_freepage(m)
	vm_page_t m;
{
	PAGE_WAKEUP(m);
	vm_page_free(m);
}

/*
 * swap_pager_ridpages is a convenience routine that deallocates all
 * but the required page.  This is usually used in error returns that
 * need to invalidate the "extra" readahead pages.
 */
static void
swap_pager_ridpages(m, count, reqpage)
	vm_page_t *m;
	int count;
	int reqpage;
{
	int i;
	for (i = 0; i < count; i++)
		if (i != reqpage)
			swap_pager_freepage(m[i]);
}

int swapwritecount=0;

/*
 * swap_pager_iodone1 is the completion routine for both reads and
 * synchronous writes (async writes use swap_pager_iodone instead)
 */
void
swap_pager_iodone1(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	bp->b_flags &= ~B_ASYNC;
	wakeup((caddr_t)bp);
/*
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
*/
}

int
swap_pager_input(swp, m, count, reqpage)
	register sw_pager_t swp;
	vm_page_t *m;
	int count, reqpage;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i;
	boolean_t rv;
	vm_offset_t kva, off[count];
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];

	int first, last;
	int failed;
	int reqdskregion;

	object = m[reqpage]->object;
	paging_offset = object->paging_offset;
	/*
	 * First determine if the page exists in the pager if this is
	 * a sync read.  This quickly handles cases where we are
	 * following shadow chains looking for the top level object
	 * with the page.
	 */
	if (swp->sw_blocks == NULL) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}

	for(i = 0; i < count; i++) {
		vm_offset_t foff = m[i]->offset + paging_offset;
		int ix = swap_pager_block_index(swp, foff);
		if (ix >= swp->sw_nblocks) {
			int j;
			if( i <= reqpage) {
				swap_pager_ridpages(m, count, reqpage);
				return(VM_PAGER_FAIL);
			}
			for(j = i; j < count; j++) {
				swap_pager_freepage(m[j]);
			}
			count = i;
			break;
		}

		swb[i] = &swp->sw_blocks[ix];
		off[i] = swap_pager_block_offset(swp, foff);
		reqaddr[i] = swb[i]->swb_block[off[i]];
	}

	/* make sure the block for the required page exists and has been written */

	if (reqaddr[reqpage] == SWB_EMPTY ||
		(swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}

	reqdskregion = reqaddr[reqpage] / dmmax;

	/*
	 * search backwards for the first contiguous page to transfer
	 */
	failed = 0;
	first = 0;
	for (i = reqpage - 1; i >= 0; --i) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (first == 0)
					first = i + 1;
		}
	}
	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	last = count;
	for (i = reqpage + 1; i < count; i++) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (last == count)
					last = i;
		}
	}

	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i-first] = m[i];
			reqaddr[i-first] = reqaddr[i];
			off[i-first] = off[i];
		}
		count -= first;
		reqpage -= first;
	}
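
	/*
	 * A readahead page stays in the cluster only if its swap block is
	 * allocated, marked valid, physically contiguous with the faulting
	 * page's block, and within the same dmmax region; every page that
	 * failed the test has been freed above and the arrays shifted down.
	 */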

	++swb[reqpage]->swb_locked;

	/*
	 * at this point:
	 * "m" is a pointer to the array of vm_page_t for paging I/O
	 * "count" is the number of vm_page_t entries represented by "m"
	 * "object" is the vm_object_t for I/O
	 * "reqpage" is the index into "m" for the page actually faulted
	 */

	spc = NULL;	/* we might not use an spc data structure */
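
	/*
	 * Two I/O setups are used below: a single page transfer borrows an
	 * spc from swap_pager_free for its preallocated kva and buffer
	 * header, while a multi-page transfer takes a physical buffer from
	 * getpbuf() and uses that buffer's own kva.
	 */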

	if (count == 1) {
		/*
		 * if a kva has not been allocated, we can only do a one page transfer,
		 * so we free the other pages that might have been allocated by
		 * vm_fault.
		 */
		swap_pager_ridpages(m, count, reqpage);
		m[0] = m[reqpage];
		reqaddr[0] = reqaddr[reqpage];

		count = 1;
		reqpage = 0;
	/*
	 * get a swap pager clean data structure, block until we get it
	 */
		if (swap_pager_free.tqh_first == NULL) {
			s = splbio();
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
			while (swap_pager_free.tqh_first == NULL) {
				swap_pager_needflags |= SWAP_FREE_NEEDED;
				tsleep((caddr_t)&swap_pager_free,
					PVM, "swpfre", 0);
				if( curproc == pageproc)
					(void) swap_pager_clean();
				else
					wakeup((caddr_t) &vm_pages_needed);
			}
			splx(s);
		}
		spc = swap_pager_free.tqh_first;
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
		kva = spc->spc_kva;
		bp = spc->spc_bp;
		bzero(bp, sizeof *bp);
		bp->b_spc = spc;
	} else {
	/*
	 * Get a swap buffer header to perform the IO
	 */
		bp = getpbuf();
		kva = (vm_offset_t) bp->b_data;
	}

	/*
	 * map our page(s) into kva for input
	 */
	pmap_qenter( kva, m, count);

	s = splbio();
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = swap_pager_iodone1;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;

	bgetvp( swapdev_vp, bp);

	swp->sw_piip++;

	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swread", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_piip;
	if (swp->sw_piip == 0)
		wakeup((caddr_t) swp);

	/*
	 * relpbuf does this, but we maintain our own buffer
	 * list also...
	 */
	if (bp->b_vp)
		brelvp(bp);

	splx(s);
	--swb[reqpage]->swb_locked;

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	if (spc) {
		/*
		 * if we have used an spc, we need to free it.
		 */
		if( bp->b_rcred != NOCRED)
			crfree(bp->b_rcred);
		if( bp->b_wcred != NOCRED)
			crfree(bp->b_wcred);
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
	} else {
		/*
		 * release the physical I/O buffer
		 */
		relpbuf(bp);
		/*
		 * finish up input if everything is ok
		 */
		if( rv == VM_PAGER_OK) {
			for (i = 0; i < count; i++) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				if (i != reqpage) {
					/*
					 * whether or not to leave the page activated
					 * is up in the air, but we should put the page
					 * on a page queue somewhere. (it already is in
					 * the object).  After some empirical results,
					 * it is best to deactivate the readahead pages.
					 */
					vm_page_deactivate(m[i]);

					/*
					 * just in case someone was asking for this
					 * page we now tell them that it is ok to use
					 */
					m[i]->flags &= ~PG_FAKE;
					PAGE_WAKEUP(m[i]);
				}
			}
			if( swap_pager_full) {
				_swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE);
			}
		} else {
			swap_pager_ridpages(m, count, reqpage);
		}
	}
	return(rv);
}
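
/*
 * Note the trick near the end of swap_pager_input above: when swap space
 * is scarce (swap_pager_full), the blocks just paged in are released
 * immediately with _swap_pager_freespace, since the in-core copies are now
 * clean and can be written to freshly allocated space later if need be.
 */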

int
swap_pager_output(swp, m, count, flags, rtvals)
	register sw_pager_t swp;
	vm_page_t *m;
	int count;
	int flags;
	int *rtvals;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i, j, ix;
	boolean_t rv;
	vm_offset_t kva, off, foff;
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];
	int failed;

/*
	if( count > 1)
		printf("off: 0x%x, count: %d\n", m[0]->offset, count);
*/
	spc = NULL;

	object = m[0]->object;
	paging_offset = object->paging_offset;

	failed = 0;
	for(j=0;j<count;j++) {
		foff = m[j]->offset + paging_offset;
		ix = swap_pager_block_index(swp, foff);
		swb[j] = 0;
		if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
			rtvals[j] = VM_PAGER_FAIL;
			failed = 1;
			continue;
		} else {
			rtvals[j] = VM_PAGER_OK;
		}
		swb[j] = &swp->sw_blocks[ix];
		++swb[j]->swb_locked;
		if( failed) {
			rtvals[j] = VM_PAGER_FAIL;
			continue;
		}
		off = swap_pager_block_offset(swp, foff);
		reqaddr[j] = swb[j]->swb_block[off];
		if( reqaddr[j] == SWB_EMPTY) {
			int blk;
			int tries;
			int ntoget;
			tries = 0;
			s = splbio();

			/*
			 * if any other pages have been allocated in this block, we
			 * only try to get one page.
			 */
			for (i = 0; i < SWB_NPAGES; i++) {
				if (swb[j]->swb_block[i] != SWB_EMPTY)
					break;
			}

			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
			/*
			 * this code is a little conservative, but works
			 * (the intent of this code is to allocate small chunks
			 *  for small objects)
			 */
			if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) {
				ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE;
			}

retrygetspace:
			if (!swap_pager_full && ntoget > 1 &&
				swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) {

				for (i = 0; i < ntoget; i++) {
					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
					swb[j]->swb_valid = 0;
				}

				reqaddr[j] = swb[j]->swb_block[off];
			} else if (!swap_pager_getswapspace(btodb(PAGE_SIZE),
				&swb[j]->swb_block[off])) {
				/*
				 * if the allocation has failed, we try to reclaim space and
				 * retry.
				 */
				if (++tries == 1) {
					swap_pager_reclaim();
					goto retrygetspace;
				}
				rtvals[j] = VM_PAGER_AGAIN;
				failed = 1;
			} else {
				reqaddr[j] = swb[j]->swb_block[off];
				swb[j]->swb_valid &= ~(1<<off);
			}
			splx(s);
		}
	}

	/*
	 * scan forward and fail every page whose swap block is not
	 * contiguous with the first page's, crosses a dmmax region
	 * boundary, or follows an earlier failure; the cluster is then
	 * truncated at the first failed page.
	 */
	failed = 0;
	for (i = 0; i < count; i++) {
		if( failed || (reqaddr[i] != reqaddr[0] + i*btodb(PAGE_SIZE)) ||
			(reqaddr[i] / dmmax) != (reqaddr[0] / dmmax) ||
			(rtvals[i] != VM_PAGER_OK)) {
			failed = 1;
			if( rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
	}

	for(i = 0; i < count; i++) {
		if( rtvals[i] != VM_PAGER_OK) {
			if( swb[i])
				--swb[i]->swb_locked;
		}
	}

	for(i = 0; i < count; i++)
		if( rtvals[i] != VM_PAGER_OK)
			break;

	if( i == 0) {
		return VM_PAGER_AGAIN;
	}

	count = i;
	for(i=0;i<count;i++) {
		if( reqaddr[i] == SWB_EMPTY)
			printf("I/O to empty block????\n");
	}
	/*
	 * For synchronous writes, we clean up
	 * all completed async pageouts.
	 */
	if ((flags & B_ASYNC) == 0) {
		swap_pager_clean();
	}

	kva = 0;

	/*
	 * we allocate a new kva for transfers > 1 page
	 * but for transfers == 1 page, the swap_pager_free list contains
	 * entries that have pre-allocated kva's (for efficiency).
	 * NOTE -- we do not use the physical buffer pool or the
	 * preallocated associated kva's because of the potential for
	 * deadlock.  This is very subtle -- but deadlocks or resource
	 * contention must be avoided on pageouts -- or your system will
	 * sleep (forever) !!!
	 */
	if ( count > 1) {
		kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE);
		if( !kva) {
			for (i = 0; i < count; i++) {
				if( swb[i])
					--swb[i]->swb_locked;
				rtvals[i] = VM_PAGER_AGAIN;
			}
			return VM_PAGER_AGAIN;
		}
	}

	/*
	 * get a swap pager clean data structure, block until we get it
	 */
	if (swap_pager_free.tqh_first == NULL) {
/*
		if (flags & B_ASYNC) {
			for(i=0;i<count;i++) {
				rtvals[i] = VM_PAGER_AGAIN;
				if( swb[i])
					--swb[i]->swb_locked;
			}
			return VM_PAGER_AGAIN;
		}
*/

		s = splbio();
		if( curproc == pageproc)
			(void) swap_pager_clean();
		else
			wakeup((caddr_t) &vm_pages_needed);
		while (swap_pager_free.tqh_first == NULL) {
			swap_pager_needflags |= SWAP_FREE_NEEDED;
			tsleep((caddr_t)&swap_pager_free,
				PVM, "swpfre", 0);
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
		}
		splx(s);
	}

	spc = swap_pager_free.tqh_first;
	TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
	if( !kva) {
		kva = spc->spc_kva;
		spc->spc_altkva = 0;
	} else {
		spc->spc_altkva = kva;
	}

	/*
	 * map our page(s) into kva for I/O
	 */
	pmap_qenter(kva, m, count);

	/*
	 * mark the swap block for each page valid, charging blocks that
	 * are validated for the first time against the free swap count
	 */
	for(i=0;i<count;i++) {
		foff = m[i]->offset + paging_offset;
		off = swap_pager_block_offset(swp, foff);
		/*
		 * if we are setting the valid bit anew,
		 * then diminish the swap free space
		 */
		if( (swb[i]->swb_valid & (1 << off)) == 0)
			vm_swap_size -= btodb(PAGE_SIZE);

		/*
		 * set the valid bit
		 */
		swb[i]->swb_valid |= (1 << off);
		/*
		 * and unlock the data structure
		 */
		--swb[i]->swb_locked;
	}

	s = splbio();
	/*
	 * Get a swap buffer header and perform the IO
	 */
	bp = spc->spc_bp;
	bzero(bp, sizeof *bp);
	bp->b_spc = spc;

	bp->b_flags = B_BUSY;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bgetvp( swapdev_vp, bp);

	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;
	swapdev_vp->v_numoutput++;

	/*
	 * If this is an async write we set up additional buffer fields
	 * and place a "cleaning" entry on the inuse queue.
	 */
	if ( flags & B_ASYNC ) {
		spc->spc_flags = 0;
		spc->spc_swp = swp;
		for(i=0;i<count;i++)
			spc->spc_m[i] = m[i];
		spc->spc_count = count;
		/*
		 * the completion routine for async writes
		 */
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;
		swp->sw_poip++;
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
	} else {
		swp->sw_poip++;
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone1;
	}
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);
	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) {
		if ((bp->b_flags & B_DONE) == B_DONE) {
			swap_pager_clean();
		}
		splx(s);
		for(i=0;i<count;i++) {
			rtvals[i] = VM_PAGER_PEND;
		}
		return VM_PAGER_PEND;
	}
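
	/*
	 * From here on the write is synchronous; the async case returned
	 * VM_PAGER_PEND above, with the pages belonging to the spc until
	 * swap_pager_iodone/swap_pager_clean hand them back.
	 */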

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swwrt", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_poip;
	if (swp->sw_poip == 0)
		wakeup((caddr_t) swp);

	if (bp->b_vp)
		brelvp(bp);

	splx(s);

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	/*
	 * if we have written the page, then indicate that the page
	 * is clean.
	 */
	if (rv == VM_PAGER_OK) {
		for(i=0;i<count;i++) {
			if( rtvals[i] == VM_PAGER_OK) {
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				/*
				 * optimization, if a page has been read during the
				 * pageout process, we activate it.
				 */
				if ( (m[i]->flags & PG_ACTIVE) == 0 &&
					pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))
					vm_page_activate(m[i]);
			}
		}
	} else {
		for(i=0;i<count;i++) {
			rtvals[i] = rv;
			m[i]->flags |= PG_LAUNDRY;
		}
	}

	if( spc->spc_altkva)
		kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE);

	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
	if (swap_pager_needflags & SWAP_FREE_NEEDED) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
	}

	return(rv);
}

boolean_t
swap_pager_clean()
{
	register swp_clean_t spc, tspc;
	register int s;

	tspc = NULL;
	if (swap_pager_done.tqh_first == NULL)
		return FALSE;
	for (;;) {
		s = splbio();
		/*
		 * Look up and removal from done list must be done
		 * at splbio() to avoid conflicts with swap_pager_iodone.
		 */
		while ((spc = swap_pager_done.tqh_first) != NULL) {
			if( spc->spc_altkva) {
				pmap_qremove( spc->spc_altkva, spc->spc_count);
				kmem_free_wakeup(pager_map, spc->spc_altkva, spc->spc_count * PAGE_SIZE);
				spc->spc_altkva = 0;
			} else {
				pmap_qremove( spc->spc_kva, 1);
			}
			swap_pager_finish(spc);
			TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
			goto doclean;
		}

		/*
		 * No operations done, that's all we can do for now.
		 */

		splx(s);
		break;

		/*
		 * A completed entry was taken off the done list above;
		 * return it to the free list and wake up any waiters.
		 */
doclean:
		if (tspc && tspc == spc) {
			tspc = NULL;
		}
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
		++cleandone;
		splx(s);
	}

	return(tspc ? TRUE : FALSE);
}

void
swap_pager_finish(spc)
	register swp_clean_t spc;
{
	vm_object_t object = spc->spc_m[0]->object;
	int i;

	if ((object->paging_in_progress -= spc->spc_count) == 0)
		thread_wakeup((int) object);

	/*
	 * If no error, mark as clean and inform the pmap system.
	 * If error, mark as dirty so we will try again.
	 * (XXX could get stuck doing this, should give up after a while)
	 */
	if (spc->spc_flags & SPC_ERROR) {
		for(i=0;i<spc->spc_count;i++) {
			printf("swap_pager_finish: clean of page %x failed\n",
			       VM_PAGE_TO_PHYS(spc->spc_m[i]));
			spc->spc_m[i]->flags |= PG_LAUNDRY;
		}
	} else {
		for(i=0;i<spc->spc_count;i++) {
			pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i]));
			spc->spc_m[i]->flags |= PG_CLEAN;
		}
	}

	for(i=0;i<spc->spc_count;i++) {
		/*
		 * we wakeup any processes that are waiting on
		 * these pages.
		 */
		PAGE_WAKEUP(spc->spc_m[i]);
	}
	nswiodone -= spc->spc_count;

	return;
}

/*
 * swap_pager_iodone is the completion routine for async pageouts.  It runs
 * at splbio time from biodone, moves the cleaning entry from the inuse
 * queue to the done queue, and wakes up whoever may be waiting on I/O
 * completion, free spc entries, or free pages.
 */
void
swap_pager_iodone(bp)
	register struct buf *bp;
{
	register swp_clean_t spc;
	int s;

	s = splbio();
	spc = (swp_clean_t) bp->b_spc;
	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
	if (bp->b_flags & B_ERROR) {
		spc->spc_flags |= SPC_ERROR;
		printf("error %d blkno %d sz %d ",
			bp->b_error, bp->b_blkno, bp->b_bcount);
	}

/*
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
*/

	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC);
	if (bp->b_vp) {
		brelvp(bp);
	}
	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);

	nswiodone += spc->spc_count;
	if (--spc->spc_swp->sw_poip == 0) {
		wakeup((caddr_t)spc->spc_swp);
	}

	if ((swap_pager_needflags & SWAP_FREE_NEEDED) ||
	    swap_pager_inuse.tqh_first == 0) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
		wakeup((caddr_t)&vm_pages_needed);
	}

	if (vm_pageout_pages_needed) {
		wakeup((caddr_t)&vm_pageout_pages_needed);
	}

	if ((swap_pager_inuse.tqh_first == NULL) ||
	    (cnt.v_free_count < cnt.v_free_min &&
	    nswiodone + cnt.v_free_count >= cnt.v_free_min) ) {
		wakeup((caddr_t)&vm_pages_needed);
	}
	splx(s);
}

/*
 * return true if any swap control structures can be allocated
 */
int
swap_pager_ready() {
	if( swap_pager_free.tqh_first)
		return 1;
	else
		return 0;
}