xref: /freebsd/sys/vm/swap_pager.c (revision afe61c15161c324a7af299a9b8457aba5afc92db)
1 /*
2  * Copyright (c) 1994 John S. Dyson
3  * Copyright (c) 1990 University of Utah.
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * the Systems Programming Group of the University of Utah Computer
9  * Science Department.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
40  *
41  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
42  */
43 
44 /*
45  * Quick hack to page to dedicated partition(s).
46  * TODO:
47  *	Add multiprocessor locks
48  *	Deal with async writes in a better fashion
49  */
50 
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/proc.h>
54 #include <sys/buf.h>
55 #include <sys/vnode.h>
56 #include <sys/malloc.h>
57 
58 #include <miscfs/specfs/specdev.h>
59 #include <sys/rlist.h>
60 
61 #include <vm/vm.h>
62 #include <vm/vm_pager.h>
63 #include <vm/vm_page.h>
64 #include <vm/vm_pageout.h>
65 #include <vm/swap_pager.h>
66 
67 #ifndef NPENDINGIO
68 #define NPENDINGIO	16
69 #endif
70 
71 extern int nswbuf;
72 int nswiodone;
73 extern int vm_pageout_rate_limit;
74 static int cleandone;
75 extern int hz;
76 int swap_pager_full;
77 extern vm_map_t pager_map;
78 extern int vm_pageout_pages_needed;
79 extern int vm_swap_size;
80 extern struct vnode *swapdev_vp;
81 
82 #define MAX_PAGEOUT_CLUSTER 8
83 
84 TAILQ_HEAD(swpclean, swpagerclean);
85 
86 typedef	struct swpagerclean	*swp_clean_t;
87 
88 struct swpagerclean {
89 	TAILQ_ENTRY(swpagerclean)	spc_list;
90 	int				spc_flags;
91 	struct buf			*spc_bp;
92 	sw_pager_t			spc_swp;
93 	vm_offset_t			spc_kva;
94 	vm_offset_t			spc_altkva;
95 	int				spc_count;
96 	vm_page_t			spc_m[MAX_PAGEOUT_CLUSTER];
97 } swcleanlist [NPENDINGIO] ;
98 
99 
100 extern vm_map_t kernel_map;
101 
102 /* spc_flags values */
103 #define SPC_ERROR	0x01
104 
105 #define SWB_EMPTY (-1)
106 
107 void		swap_pager_init(void);
108 vm_pager_t	swap_pager_alloc(caddr_t, vm_size_t, vm_prot_t, vm_offset_t);
109 void		swap_pager_dealloc(vm_pager_t);
110 int		swap_pager_getpage(vm_pager_t, vm_page_t, boolean_t);
111 int		swap_pager_putpage(vm_pager_t, vm_page_t, boolean_t);
112 int		swap_pager_getmulti(vm_pager_t, vm_page_t *, int, int, boolean_t);
113 boolean_t	swap_pager_haspage(vm_pager_t, vm_offset_t);
114 int		swap_pager_io(sw_pager_t, vm_page_t *, int, int, int);
115 void		swap_pager_iodone(struct buf *);
116 boolean_t	swap_pager_clean();
117 
118 extern struct pagerops swappagerops;
119 
120 struct swpclean swap_pager_done;	/* list of completed page cleans */
121 struct swpclean swap_pager_inuse;	/* list of pending page cleans */
122 struct swpclean swap_pager_free;	/* list of free pager clean structs */
123 struct pagerlst swap_pager_list;	/* list of "named" anon regions */
124 struct pagerlst swap_pager_un_list;	/* list of "unnamed" anon pagers */
125 
126 #define	SWAP_FREE_NEEDED	0x1	/* need a swap block */
127 int swap_pager_needflags;
128 struct rlist *swapfrag;
129 
130 struct pagerlst *swp_qs[]={
131 	&swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0
132 };
133 
134 int swap_pager_putmulti();
135 
136 struct pagerops swappagerops = {
137 	swap_pager_init,
138 	swap_pager_alloc,
139 	swap_pager_dealloc,
140 	swap_pager_getpage,
141 	swap_pager_getmulti,
142 	swap_pager_putpage,
143 	swap_pager_putmulti,
144 	swap_pager_haspage
145 };
146 
148 
149 int npendingio = NPENDINGIO;
150 int pendingiowait;
151 int require_swap_init;
152 void swap_pager_finish();
153 int dmmin, dmmax;
154 extern int vm_page_count;
155 
156 struct buf * getpbuf() ;
157 void relpbuf(struct buf *bp) ;
158 
159 static inline void swapsizecheck() {
160 	if( vm_swap_size < 128*btodb(PAGE_SIZE)) {
161 		if( swap_pager_full)
162 			printf("swap_pager: out of space\n");
163 		swap_pager_full = 1;
164 	} else if( vm_swap_size > 192*btodb(PAGE_SIZE))
165 		swap_pager_full = 0;
166 }
167 
168 void
169 swap_pager_init()
170 {
171 	extern int dmmin, dmmax;
172 
173 	dfltpagerops = &swappagerops;
174 
175 	TAILQ_INIT(&swap_pager_list);
176 	TAILQ_INIT(&swap_pager_un_list);
177 
178 	/*
179 	 * Initialize clean lists
180 	 */
181 	TAILQ_INIT(&swap_pager_inuse);
182 	TAILQ_INIT(&swap_pager_done);
183 	TAILQ_INIT(&swap_pager_free);
184 
185 	require_swap_init = 1;
186 
187 	/*
188 	 * Calculate the swap allocation constants.
189 	 */
190 
191 	dmmin = CLBYTES/DEV_BSIZE;
192 	dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2;
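	/*
	 * For example, with PAGE_SIZE == 4096, DEV_BSIZE == 512,
	 * CLBYTES == 4096 and SWB_NPAGES == 8 (typical i386 values,
	 * used here only as an illustration):
	 *	dmmin = 4096/512 = 8 disk blocks (one page)
	 *	dmmax = btodb(8*4096)*2 = 64*2 = 128 disk blocks
	 * i.e. a swap "region" spans two full swap blocks.
	 */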
193 
194 }
195 
196 /*
197  * Allocate a pager structure and associated resources.
198  * Note that if we are called from the pageout daemon (handle == NULL)
199  * we should not wait for memory as doing so could result in deadlock.
200  */
201 vm_pager_t
202 swap_pager_alloc(handle, size, prot, offset)
203 	caddr_t handle;
204 	register vm_size_t size;
205 	vm_prot_t prot;
206 	vm_offset_t offset;
207 {
208 	register vm_pager_t pager;
209 	register sw_pager_t swp;
210 	int waitok;
211 	int i,j;
212 
213 	if (require_swap_init) {
214 		swp_clean_t spc;
215 		struct buf *bp;
216 		/*
217 		 * kva's are allocated here so that we don't need to keep
218 		 * calling kmem_alloc_pageable at runtime
219 		 */
220 		for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) {
221 			spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE);
222 			if (!spc->spc_kva) {
223 				break;
224 			}
225 			spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT);
226 			if (!spc->spc_bp) {
227 				kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE);
228 				break;
229 			}
230 			spc->spc_flags = 0;
231 			TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
232 		}
233 		require_swap_init = 0;
234 		if( size == 0)
235 			return(NULL);
236 	}
237 
238 	/*
239 	 * If this is a "named" anonymous region, look it up and
240 	 * return the appropriate pager if it exists.
241 	 */
242 	if (handle) {
243 		pager = vm_pager_lookup(&swap_pager_list, handle);
244 		if (pager != NULL) {
245 			/*
246 			 * Use vm_object_lookup to gain a reference
247 			 * to the object and also to remove from the
248 			 * object cache.
249 			 */
250 			if (vm_object_lookup(pager) == NULL)
251 				panic("swap_pager_alloc: bad object");
252 			return(pager);
253 		}
254 	}
255 
256 	if (swap_pager_full) {
257 		return(NULL);
258 	}
259 
260 	/*
261 	 * Pager doesn't exist, allocate swap management resources
262 	 * and initialize.
263 	 */
264 	waitok = handle ? M_WAITOK : M_NOWAIT;
265 	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
266 	if (pager == NULL)
267 		return(NULL);
268 	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
269 	if (swp == NULL) {
270 		free((caddr_t)pager, M_VMPAGER);
271 		return(NULL);
272 	}
273 	size = round_page(size);
274 	swp->sw_osize = size;
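	/*
	 * sw_nblocks below is the object size expressed in
	 * SWB_NPAGES-page swap blocks, rounded up.  E.g. (illustrative,
	 * with 4K pages and SWB_NPAGES == 8): a 1MB object is
	 * btodb(1MB) = 2048 disk blocks; each swap block covers
	 * btodb(8*4096) = 64 of them, so sw_nblocks = (2048 + 63)/64 = 32.
	 */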
275 	swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE);
276 	swp->sw_blocks = (sw_blk_t)
277 		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
278 		       M_VMPGDATA, waitok);
279 	if (swp->sw_blocks == NULL) {
280 		free((caddr_t)swp, M_VMPGDATA);
281 		free((caddr_t)pager, M_VMPAGER);
282 		return(NULL);
283 	}
284 
285 	for (i = 0; i < swp->sw_nblocks; i++) {
286 		swp->sw_blocks[i].swb_valid = 0;
287 		swp->sw_blocks[i].swb_locked = 0;
288 		for (j = 0; j < SWB_NPAGES; j++)
289 			swp->sw_blocks[i].swb_block[j] = SWB_EMPTY;
290 	}
291 
292 	swp->sw_poip = 0;
293 	if (handle) {
294 		vm_object_t object;
295 
296 		swp->sw_flags = SW_NAMED;
297 		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
298 		/*
299 		 * Consistent with other pagers: return with object
300 		 * referenced.  Can't do this with handle == NULL
301 		 * since it might be the pageout daemon calling.
302 		 */
303 		object = vm_object_allocate(size);
304 		vm_object_enter(object, pager);
305 		vm_object_setpager(object, pager, 0, FALSE);
306 	} else {
307 		swp->sw_flags = 0;
308 		TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list);
309 	}
310 	pager->pg_handle = handle;
311 	pager->pg_ops = &swappagerops;
312 	pager->pg_type = PG_SWAP;
313 	pager->pg_data = (caddr_t)swp;
314 
315 	return(pager);
316 }
317 
318 /*
319  * returns the disk block address associated with a pager and offset;
320  * additionally, as a side effect, returns a flag indicating
321  * whether the block has been written
322  */
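/*
 * Worked example of the index arithmetic (assuming PAGE_SIZE == 4096
 * and SWB_NPAGES == 8, so each sw_blk_t covers 0x8000 bytes): for
 * offset 0x2d000, ix = 0x2d000/0x8000 = 5 selects sw_blocks[5], and
 * (0x2d000 % 0x8000)/0x1000 = 5 selects swb_block[5] within it; bit 5
 * of swb_valid then says whether that page has been written to swap.
 */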
323 
324 static int *
325 swap_pager_diskaddr(swp, offset, valid)
326 	sw_pager_t swp;
327 	vm_offset_t offset;
328 	int *valid;
329 {
330 	register sw_blk_t swb;
331 	int ix;
332 
333 	if (valid)
334 		*valid = 0;
335 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
336 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
337 		return(FALSE);
338 	}
339 	swb = &swp->sw_blocks[ix];
340 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
341 	if (valid)
342 		*valid = swb->swb_valid & (1<<ix);
343 	return &swb->swb_block[ix];
344 }
345 
346 /*
347  * Utility routine to set the valid (written) bit for
348  * a block associated with a pager and offset
349  */
350 static void
351 swap_pager_setvalid(swp, offset, valid)
352 	sw_pager_t swp;
353 	vm_offset_t offset;
354 	int valid;
355 {
356 	register sw_blk_t swb;
357 	int ix;
358 
359 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
360 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks)
361 		return;
362 
363 	swb = &swp->sw_blocks[ix];
364 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
365 	if (valid)
366 		swb->swb_valid |= (1 << ix);
367 	else
368 		swb->swb_valid &= ~(1 << ix);
369 	return;
370 }
371 
372 /*
373  * this routine allocates swap space with a fragmentation
374  * minimization policy.
375  */
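/*
 * Sketch of the policy (numbers illustrative, assuming 4K pages and
 * SWB_NPAGES == 8, so nblocksfrag == 64): a request for fewer than 64
 * disk blocks is served from the "swapfrag" list first; failing that,
 * a full 64-block chunk is carved out of swapmap, the request is
 * satisfied from its front, and the unused tail is put on swapfrag
 * for later small requests.  Requests of 64 blocks or more go
 * straight to swapmap.
 */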
376 int
377 swap_pager_getswapspace( unsigned amount, unsigned *rtval) {
378 	unsigned tmpalloc;
379 	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
380 	if( amount < nblocksfrag) {
381 		if( rlist_alloc(&swapfrag, amount, rtval))
382 			return 1;
383 		if( !rlist_alloc(&swapmap, nblocksfrag, &tmpalloc))
384 			return 0;
385 		rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1);
386 		*rtval = tmpalloc;
387 		return 1;
388 	}
389 	if( !rlist_alloc(&swapmap, amount, rtval))
390 		return 0;
391 	else
392 		return 1;
393 }
394 
395 /*
396  * this routine frees swap space with a fragmentation
397  * minimization policy.
398  */
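/*
 * The inverse of the allocation policy above: whole nblocksfrag-sized
 * runs are returned directly to swapmap, the remainder goes to
 * swapfrag, and the final loop re-coalesces swapfrag into swapmap
 * whenever a full-sized chunk has accumulated.  E.g. freeing blocks
 * 10..200 with nblocksfrag == 64 returns 10..73 and 74..137 to
 * swapmap and puts 138..200 on the fragment list.
 */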
399 void
400 swap_pager_freeswapspace( unsigned from, unsigned to) {
401 	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
402 	unsigned tmpalloc;
403 	if( ((to + 1) - from) >= nblocksfrag) {
404 		while( (from + nblocksfrag) <= to + 1) {
405 			rlist_free(&swapmap, from, from + nblocksfrag - 1);
406 			from += nblocksfrag;
407 		}
408 	}
409 	if( from >= to)
410 		return;
411 	rlist_free(&swapfrag, from, to);
412 	while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) {
413 		rlist_free(&swapmap, tmpalloc, tmpalloc + nblocksfrag-1);
414 	}
415 }
416 /*
417  * this routine frees swap blocks from a specified pager
418  */
419 void
420 _swap_pager_freespace(swp, start, size)
421 	sw_pager_t swp;
422 	vm_offset_t start;
423 	vm_offset_t size;
424 {
425 	vm_offset_t i;
426 	int s;
427 
428 	s = splbio();
429 	for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) {
430 		int valid;
431 		int *addr = swap_pager_diskaddr(swp, i, &valid);
432 		if (addr && *addr != SWB_EMPTY) {
433 			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
434 			if( valid) {
435 				vm_swap_size += btodb(PAGE_SIZE);
436 				swap_pager_setvalid(swp, i, 0);
437 			}
438 			*addr = SWB_EMPTY;
439 		}
440 	}
441 	swapsizecheck();
442 	splx(s);
443 }
444 
445 void
446 swap_pager_freespace(pager, start, size)
447 	vm_pager_t pager;
448 	vm_offset_t start;
449 	vm_offset_t size;
450 {
451 	_swap_pager_freespace((sw_pager_t) pager->pg_data, start, size);
452 }
453 
454 /*
455  * swap_pager_reclaim frees up over-allocated space from all pagers
456  * this eliminates internal fragmentation due to allocation of space
457  * for segments that are never swapped to. It has been written so that
458  * it does not block until the rlist_free operation occurs; it keeps
459  * the queues consistent.
460  */
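/*
 * A block is reclaimable when space was allocated for it but the page
 * was never actually written, i.e. swb_block[j] != SWB_EMPTY while bit
 * j of swb_valid is still clear.  Such blocks arise because the output
 * path optimistically allocates SWB_NPAGES-sized chunks.
 */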
461 
462 /*
463  * Maximum number of blocks (pages) to reclaim per pass
464  */
465 #define MAXRECLAIM 256
466 
467 void
468 swap_pager_reclaim()
469 {
470 	vm_pager_t p;
471 	sw_pager_t swp;
472 	int i, j, k;
473 	int s;
474 	int reclaimcount;
475 	static int reclaims[MAXRECLAIM];
476 	static int in_reclaim;
477 
478 /*
479  * allow only one process to be in the swap_pager_reclaim subroutine
480  */
481 	s = splbio();
482 	if (in_reclaim) {
483 		tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0);
484 		splx(s);
485 		return;
486 	}
487 	in_reclaim = 1;
488 	reclaimcount = 0;
489 
490 	/* for each pager queue */
491 	for (k = 0; swp_qs[k]; k++) {
492 
493 		p = swp_qs[k]->tqh_first;
494 		while (p && (reclaimcount < MAXRECLAIM)) {
495 
496 			/*
497 			 * see if any blocks associated with a pager have been
498 			 * allocated but not used (written)
499 			 */
500 			swp = (sw_pager_t) p->pg_data;
501 			for (i = 0; i < swp->sw_nblocks; i++) {
502 				sw_blk_t swb = &swp->sw_blocks[i];
503 				if( swb->swb_locked)
504 					continue;
505 				for (j = 0; j < SWB_NPAGES; j++) {
506 					if (swb->swb_block[j] != SWB_EMPTY &&
507 						(swb->swb_valid & (1 << j)) == 0) {
508 						reclaims[reclaimcount++] = swb->swb_block[j];
509 						swb->swb_block[j] = SWB_EMPTY;
510 						if (reclaimcount >= MAXRECLAIM)
511 							goto rfinished;
512 					}
513 				}
514 			}
515 			p = p->pg_list.tqe_next;
516 		}
517 	}
518 
519 rfinished:
520 
521 /*
522  * free the blocks that have been added to the reclaim list
523  */
524 	for (i = 0; i < reclaimcount; i++) {
525 		swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1);
526 		swapsizecheck();
527 		wakeup((caddr_t) &in_reclaim);
528 	}
529 
530 	splx(s);
531 	in_reclaim = 0;
532 	wakeup((caddr_t) &in_reclaim);
533 }
534 
535 
536 /*
537  * swap_pager_copy copies blocks from one pager to another and
538  * destroys the source pager
539  */
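/*
 * The copy proceeds in three passes over the disk-address arrays:
 * first the source blocks below (offset + srcoffset) are released,
 * then every valid source block within the destination's range is
 * handed over (or released, if the destination already holds a valid
 * block there), and finally any source blocks beyond the destination's
 * size are released.  Only block addresses move; no page data is
 * copied.
 */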
540 
541 void
542 swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset)
543 	vm_pager_t srcpager;
544 	vm_offset_t srcoffset;
545 	vm_pager_t dstpager;
546 	vm_offset_t dstoffset;
547 	vm_offset_t offset;
548 {
549 	sw_pager_t srcswp, dstswp;
550 	vm_offset_t i;
551 	int s;
552 
553 	srcswp = (sw_pager_t) srcpager->pg_data;
554 	dstswp = (sw_pager_t) dstpager->pg_data;
555 
556 /*
557  * remove the source pager from the swap_pager internal queue
558  */
559 	s = splbio();
560 	if (srcswp->sw_flags & SW_NAMED) {
561 		TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list);
562 		srcswp->sw_flags &= ~SW_NAMED;
563 	} else {
564 		TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list);
565 	}
566 
567 	while (srcswp->sw_poip) {
568 		tsleep((caddr_t)srcswp, PVM, "spgout", 0);
569 	}
570 	splx(s);
571 
572 /*
573  * clean all of the pages that are currently active and finished
574  */
575 	(void) swap_pager_clean();
576 
577 	s = splbio();
578 /*
579  * clear source block before destination object
580  * (release allocated space)
581  */
582 	for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) {
583 		int valid;
584 		int *addr = swap_pager_diskaddr(srcswp, i, &valid);
585 		if (addr && *addr != SWB_EMPTY) {
586 			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
587 			if( valid)
588 				vm_swap_size += btodb(PAGE_SIZE);
589 			swapsizecheck();
590 			*addr = SWB_EMPTY;
591 		}
592 	}
593 /*
594  * transfer source to destination
595  */
596 	for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) {
597 		int srcvalid, dstvalid;
598 		int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset,
599 			&srcvalid);
600 		int *dstaddrp;
601 	/*
602 	 * see if the source has space allocated
603 	 */
604 		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
605 		/*
606 		 * if the source is valid and the dest has no space, then
607 		 * copy the allocation from the source to the dest.
608 		 */
609 			if (srcvalid) {
610 				dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid);
611 				/*
612 				 * if the dest already has a valid block, deallocate the
613 				 * source block without copying.
614 				 */
615 				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
616 					swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1);
617 					*dstaddrp = SWB_EMPTY;
618 				}
619 				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
620 					*dstaddrp = *srcaddrp;
621 					*srcaddrp = SWB_EMPTY;
622 					swap_pager_setvalid(dstswp, i + dstoffset, 1);
623 					vm_swap_size -= btodb(PAGE_SIZE);
624 				}
625 			}
626 		/*
627 		 * if the source is not empty at this point, then deallocate the space.
628 		 */
629 			if (*srcaddrp != SWB_EMPTY) {
630 				swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
631 				if( srcvalid)
632 					vm_swap_size += btodb(PAGE_SIZE);
633 				*srcaddrp = SWB_EMPTY;
634 			}
635 		}
636 	}
637 
638 /*
639  * deallocate the rest of the source object
640  */
641 	for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) {
642 		int valid;
643 		int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid);
644 		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
645 			swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
646 			if( valid)
647 				vm_swap_size += btodb(PAGE_SIZE);
648 			*srcaddrp = SWB_EMPTY;
649 		}
650 	}
651 
652 	swapsizecheck();
653 	splx(s);
654 
655 	free((caddr_t)srcswp->sw_blocks, M_VMPGDATA);
656 	srcswp->sw_blocks = 0;
657 	free((caddr_t)srcswp, M_VMPGDATA);
658 	srcpager->pg_data = 0;
659 	free((caddr_t)srcpager, M_VMPAGER);
660 
661 	return;
662 }
663 
664 
665 void
666 swap_pager_dealloc(pager)
667 	vm_pager_t pager;
668 {
669 	register int i,j;
670 	register sw_blk_t bp;
671 	register sw_pager_t swp;
672 	int s;
673 
674 	/*
675 	 * Remove from list right away so lookups will fail if we
676 	 * block for pageout completion.
677 	 */
678 	s = splbio();
679 	swp = (sw_pager_t) pager->pg_data;
680 	if (swp->sw_flags & SW_NAMED) {
681 		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
682 		swp->sw_flags &= ~SW_NAMED;
683 	} else {
684 		TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list);
685 	}
686 	/*
687 	 * Wait for all pageouts to finish and remove
688 	 * all entries from cleaning list.
689 	 */
690 
691 	while (swp->sw_poip) {
692 		tsleep((caddr_t)swp, PVM, "swpout", 0);
693 	}
694 	splx(s);
695 
696 
697 	(void) swap_pager_clean();
698 
699 	/*
700 	 * Free left over swap blocks
701 	 */
702 	s = splbio();
703 	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) {
704 		for (j = 0; j < SWB_NPAGES; j++)
705 		if (bp->swb_block[j] != SWB_EMPTY) {
706 			swap_pager_freeswapspace((unsigned)bp->swb_block[j],
707 				(unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1);
708 			if( bp->swb_valid & (1<<j))
709 				vm_swap_size += btodb(PAGE_SIZE);
710 			bp->swb_block[j] = SWB_EMPTY;
711 		}
712 	}
713 	splx(s);
714 	swapsizecheck();
715 
716 	/*
717 	 * Free swap management resources
718 	 */
719 	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
720 	swp->sw_blocks = 0;
721 	free((caddr_t)swp, M_VMPGDATA);
722 	pager->pg_data = 0;
723 	free((caddr_t)pager, M_VMPAGER);
724 }
725 
726 /*
727  * swap_pager_getmulti can get multiple pages.
728  */
729 int
730 swap_pager_getmulti(pager, m, count, reqpage, sync)
731 	vm_pager_t pager;
732 	vm_page_t *m;
733 	int count;
734 	int reqpage;
735 	boolean_t sync;
736 {
737 	if( reqpage >= count)
738 		panic("swap_pager_getmulti: reqpage >= count\n");
739 	return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage);
740 }
741 
742 /*
743  * swap_pager_getpage gets individual pages
744  */
745 int
746 swap_pager_getpage(pager, m, sync)
747 	vm_pager_t pager;
748 	vm_page_t m;
749 	boolean_t sync;
750 {
751 	vm_page_t marray[1];
752 
753 	marray[0] = m;
754 	return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0);
755 }
756 
757 int
758 swap_pager_putmulti(pager, m, c, sync, rtvals)
759 	vm_pager_t pager;
760 	vm_page_t *m;
761 	int c;
762 	boolean_t sync;
763 	int *rtvals;
764 {
765 	int flags;
766 
767 	if (pager == NULL) {
768 		(void) swap_pager_clean();
769 		return VM_PAGER_OK;
770 	}
771 
772 	flags = B_WRITE;
773 	if (!sync)
774 		flags |= B_ASYNC;
775 
776 	return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals);
777 }
778 
779 /*
780  * swap_pager_putpage writes individual pages
781  */
782 int
783 swap_pager_putpage(pager, m, sync)
784 	vm_pager_t pager;
785 	vm_page_t m;
786 	boolean_t sync;
787 {
788 	int flags;
789 	vm_page_t marray[1];
790 	int rtvals[1];
791 
792 
793 	if (pager == NULL) {
794 		(void) swap_pager_clean();
795 		return VM_PAGER_OK;
796 	}
797 
798 	marray[0] = m;
799 	flags = B_WRITE;
800 	if (!sync)
801 		flags |= B_ASYNC;
802 
803 	swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals);
804 
805 	return rtvals[0];
806 }
807 
808 static inline int
809 const swap_pager_block_index(swp, offset)
810 	sw_pager_t swp;
811 	vm_offset_t offset;
812 {
813 	return (offset / (SWB_NPAGES*PAGE_SIZE));
814 }
815 
816 static inline int
817 const swap_pager_block_offset(swp, offset)
818 	sw_pager_t swp;
819 	vm_offset_t offset;
820 {
821 	return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE);
822 }
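/*
 * Together these helpers decompose a byte offset within the pager:
 * e.g. (with 4K pages and SWB_NPAGES == 8) offset 0x2d000 yields block
 * index 5 and page offset 5, the same arithmetic that is open-coded
 * in swap_pager_diskaddr above and _swap_pager_haspage below.
 */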
823 
824 /*
825  * _swap_pager_haspage returns TRUE if the pager has data that has
826  * been written out.
827  */
828 static boolean_t
829 _swap_pager_haspage(swp, offset)
830 	sw_pager_t swp;
831 	vm_offset_t offset;
832 {
833 	register sw_blk_t swb;
834 	int ix;
835 
836 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
837 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
838 		return(FALSE);
839 	}
840 	swb = &swp->sw_blocks[ix];
841 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
842 	if (swb->swb_block[ix] != SWB_EMPTY) {
843 		if (swb->swb_valid & (1 << ix))
844 			return TRUE;
845 	}
846 
847 	return(FALSE);
848 }
849 
850 /*
851  * swap_pager_haspage is the externally accessible version of
852  * _swap_pager_haspage above.  this routine takes a vm_pager_t
853  * for an argument instead of sw_pager_t.
854  */
855 boolean_t
856 swap_pager_haspage(pager, offset)
857 	vm_pager_t pager;
858 	vm_offset_t offset;
859 {
860 	return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset);
861 }
862 
863 /*
864  * swap_pager_freepage is a convenience routine that clears the busy
865  * bit and deallocates a page.
866  */
867 static void
868 swap_pager_freepage(m)
869 	vm_page_t m;
870 {
871 	PAGE_WAKEUP(m);
872 	vm_page_free(m);
873 }
874 
875 /*
876  * swap_pager_ridpages is a convenience routine that deallocates all
877  * but the required page.  this is usually used in error returns that
878  * need to invalidate the "extra" readahead pages.
879  */
880 static void
881 swap_pager_ridpages(m, count, reqpage)
882 	vm_page_t *m;
883 	int count;
884 	int reqpage;
885 {
886 	int i;
887 	for (i = 0; i < count; i++)
888 		if (i != reqpage)
889 			swap_pager_freepage(m[i]);
890 }
891 
892 int swapwritecount=0;
893 
894 /*
895  * swap_pager_iodone1 is the completion routine for both reads and async writes
896  */
897 void
898 swap_pager_iodone1(bp)
899 	struct buf *bp;
900 {
901 	bp->b_flags |= B_DONE;
902 	bp->b_flags &= ~B_ASYNC;
903 	wakeup((caddr_t)bp);
904 /*
905 	if ((bp->b_flags & B_READ) == 0)
906 		vwakeup(bp);
907 */
908 }
909 
910 
911 int
912 swap_pager_input(swp, m, count, reqpage)
913 	register sw_pager_t swp;
914 	vm_page_t *m;
915 	int count, reqpage;
916 {
917 	register struct buf *bp;
918 	sw_blk_t swb[count];
919 	register int s;
920 	int i;
921 	boolean_t rv;
922 	vm_offset_t kva, off[count];
923 	swp_clean_t spc;
924 	vm_offset_t paging_offset;
925 	vm_object_t object;
926 	int reqaddr[count];
927 
928 	int first, last;
929 	int failed;
930 	int reqdskregion;
931 
932 	object = m[reqpage]->object;
933 	paging_offset = object->paging_offset;
934 	/*
935 	 * First determine if the page exists in the pager if this is
936 	 * a sync read.  This quickly handles cases where we are
937 	 * following shadow chains looking for the top level object
938 	 * with the page.
939 	 */
940 	if (swp->sw_blocks == NULL) {
941 		swap_pager_ridpages(m, count, reqpage);
942 		return(VM_PAGER_FAIL);
943 	}
944 
945 	for(i = 0; i < count; i++) {
946 		vm_offset_t foff = m[i]->offset + paging_offset;
947 		int ix = swap_pager_block_index(swp, foff);
948 		if (ix >= swp->sw_nblocks) {
949 			int j;
950 			if( i <= reqpage) {
951 				swap_pager_ridpages(m, count, reqpage);
952 				return(VM_PAGER_FAIL);
953 			}
954 			for(j = i; j < count; j++) {
955 				swap_pager_freepage(m[j]);
956 			}
957 			count = i;
958 			break;
959 		}
960 
961 		swb[i] = &swp->sw_blocks[ix];
962 		off[i] = swap_pager_block_offset(swp, foff);
963 		reqaddr[i] = swb[i]->swb_block[off[i]];
964 	}
965 
966 	/* make sure that the block backing our required page actually exists */
967 
968 	if (reqaddr[reqpage] == SWB_EMPTY ||
969 		(swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
970 		swap_pager_ridpages(m, count, reqpage);
971 		return(VM_PAGER_FAIL);
972 	}
973 
974 
975 	reqdskregion = reqaddr[reqpage] / dmmax;
976 
977 	/*
978 	 * search backwards for the first contiguous page to transfer
979 	 */
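	/*
	 * A page i belongs to the cluster only if its swap block is
	 * allocated and valid, sits at exactly reqaddr[reqpage] +
	 * (i - reqpage)*btodb(PAGE_SIZE) on disk, and lies in the same
	 * dmmax-aligned region as the faulted page, so the whole
	 * cluster can be read in one contiguous transfer.
	 */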
980 	failed = 0;
981 	first = 0;
982 	for (i = reqpage - 1; i >= 0; --i) {
983 		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
984 			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
985 			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
986 			((reqaddr[i] / dmmax) != reqdskregion)) {
987 				failed = 1;
988 				swap_pager_freepage(m[i]);
989 				if (first == 0)
990 					first = i + 1;
991 		}
992 	}
993 	/*
994 	 * search forwards for the last contiguous page to transfer
995 	 */
996 	failed = 0;
997 	last = count;
998 	for (i = reqpage + 1; i < count; i++) {
999 		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
1000 			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
1001 			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
1002 			((reqaddr[i] / dmmax) != reqdskregion)) {
1003 				failed = 1;
1004 				swap_pager_freepage(m[i]);
1005 				if (last == count)
1006 					last = i;
1007 		}
1008 	}
1009 
1010 	count = last;
1011 	if (first != 0) {
1012 		for (i = first; i < count; i++) {
1013 			m[i-first] = m[i];
1014 			reqaddr[i-first] = reqaddr[i];
1015 			off[i-first] = off[i];
1016 		}
1017 		count -= first;
1018 		reqpage -= first;
1019 	}
1020 
1021 	++swb[reqpage]->swb_locked;
1022 
1023 	/*
1024 	 * at this point:
1025 	 * "m" is a pointer to the array of vm_page_t for paging I/O
1026 	 * "count" is the number of vm_page_t entries represented by "m"
1027 	 * "object" is the vm_object_t for I/O
1028 	 * "reqpage" is the index into "m" for the page actually faulted
1029 	 */
1030 
1031 	spc = NULL;	/* we might not use an spc data structure */
1032 	kva = 0;
1033 
1034 	/*
1035 	 * we allocate a new kva for transfers > 1 page
1036 	 * but for transfers == 1 page, the swap_pager_free list contains
1037 	 * entries that have pre-allocated kva's (for efficiency).
1038 	 */
1039 	if (count > 1) {
1040 		kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE);
1041 	}
1042 
1043 
1044 	if (!kva) {
1045 		/*
1046 		 * if a kva has not been allocated, we can only do a one page transfer,
1047 		 * so we free the other pages that might have been allocated by
1048 		 * vm_fault.
1049 		 */
1050 		swap_pager_ridpages(m, count, reqpage);
1051 		m[0] = m[reqpage];
1052 		reqaddr[0] = reqaddr[reqpage];
1053 
1054 		count = 1;
1055 		reqpage = 0;
1056 	/*
1057 	 * get a swap pager clean data structure, block until we get it
1058 	 */
1059 		if (swap_pager_free.tqh_first == NULL) {
1060 			s = splbio();
1061 			if( curproc == pageproc)
1062 				(void) swap_pager_clean();
1063 			else
1064 				wakeup((caddr_t) &vm_pages_needed);
1065 			while (swap_pager_free.tqh_first == NULL) {
1066 				swap_pager_needflags |= SWAP_FREE_NEEDED;
1067 				tsleep((caddr_t)&swap_pager_free,
1068 					PVM, "swpfre", 0);
1069 				if( curproc == pageproc)
1070 					(void) swap_pager_clean();
1071 				else
1072 					wakeup((caddr_t) &vm_pages_needed);
1073 			}
1074 			splx(s);
1075 		}
1076 		spc = swap_pager_free.tqh_first;
1077 		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
1078 		kva = spc->spc_kva;
1079 	}
1080 
1081 
1082 	/*
1083 	 * map our page(s) into kva for input
1084 	 */
1085 	for (i = 0; i < count; i++) {
1086 		pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i]));
1087 	}
1088 	pmap_update();
1089 
1090 
1091 	/*
1092 	 * Get a swap buffer header and perform the IO
1093 	 */
1094 	if( spc) {
1095 		bp = spc->spc_bp;
1096 		bzero(bp, sizeof *bp);
1097 		bp->b_spc = spc;
1098 	} else {
1099 		bp = getpbuf();
1100 	}
1101 
1102 	s = splbio();
1103 	bp->b_flags = B_BUSY | B_READ | B_CALL;
1104 	bp->b_iodone = swap_pager_iodone1;
1105 	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
1106 	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
1107 	crhold(bp->b_rcred);
1108 	crhold(bp->b_wcred);
1109 	bp->b_un.b_addr = (caddr_t) kva;
1110 	bp->b_blkno = reqaddr[0];
1111 	bp->b_bcount = PAGE_SIZE*count;
1112 	bp->b_bufsize = PAGE_SIZE*count;
1113 
1114 /*
1115 	VHOLD(swapdev_vp);
1116 	bp->b_vp = swapdev_vp;
1117 	if (swapdev_vp->v_type == VBLK)
1118 		bp->b_dev = swapdev_vp->v_rdev;
1119 */
1120 	bgetvp( swapdev_vp, bp);
1121 
1122 	swp->sw_piip++;
1123 
1124 	/*
1125 	 * perform the I/O
1126 	 */
1127 	VOP_STRATEGY(bp);
1128 
1129 	/*
1130 	 * wait for the sync I/O to complete
1131 	 */
1132 	while ((bp->b_flags & B_DONE) == 0) {
1133 		tsleep((caddr_t)bp, PVM, "swread", 0);
1134 	}
1135 	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
1136 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);
1137 
1138 	--swp->sw_piip;
1139 	if (swp->sw_piip == 0)
1140 		wakeup((caddr_t) swp);
1141 
1142 	/*
1143 	 * relpbuf does this, but we maintain our own buffer
1144 	 * list also...
1145 	 */
1146 	if (bp->b_vp)
1147 		brelvp(bp);
1148 
1149 	splx(s);
1150 	--swb[reqpage]->swb_locked;
1151 
1152 	/*
1153 	 * remove the mapping for kernel virtual
1154 	 */
1155 	pmap_remove(vm_map_pmap(pager_map), kva, kva + count * PAGE_SIZE);
1156 
1157 	if (spc) {
1158 		/*
1159 		 * if we have used an spc, we need to free it.
1160 		 */
1161 		if( bp->b_rcred != NOCRED)
1162 			crfree(bp->b_rcred);
1163 		if( bp->b_wcred != NOCRED)
1164 			crfree(bp->b_wcred);
1165 		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
1166 		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
1167 			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
1168 			wakeup((caddr_t)&swap_pager_free);
1169 		}
1170 	} else {
1171 		/*
1172 		 * free the kernel virtual addresses
1173 		 */
1174 		kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE);
1175 		/*
1176 		 * release the physical I/O buffer
1177 		 */
1178 		relpbuf(bp);
1179 		/*
1180 		 * finish up input if everything is ok
1181 		 */
1182 		if( rv == VM_PAGER_OK) {
1183 			for (i = 0; i < count; i++) {
1184 				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
1185 				m[i]->flags |= PG_CLEAN;
1186 				m[i]->flags &= ~PG_LAUNDRY;
1187 				if (i != reqpage) {
1188 					/*
1189 					 * whether or not to leave the page activated
1190 					 * is up in the air, but we should put the page
1191 					 * on a page queue somewhere. (it already is in
1192 					 * the object).
1193 					 * After some empirical results, it is best
1194 					 * to deactivate the readahead pages.
1195 					 */
1196 					vm_page_deactivate(m[i]);
1197 					m[i]->act_count = 2;
1198 
1199 					/*
1200 					 * just in case someone was asking for this
1201 					 * page we now tell them that it is ok to use
1202 					 */
1203 					m[i]->flags &= ~PG_FAKE;
1204 					PAGE_WAKEUP(m[i]);
1205 				}
1206 			}
1207 			if( swap_pager_full) {
1208 				_swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE);
1209 			}
1210 		} else {
1211 			swap_pager_ridpages(m, count, reqpage);
1212 		}
1213 	}
1214 	return(rv);
1215 }
1216 
1217 int
1218 swap_pager_output(swp, m, count, flags, rtvals)
1219 	register sw_pager_t swp;
1220 	vm_page_t *m;
1221 	int count;
1222 	int flags;
1223 	int *rtvals;
1224 {
1225 	register struct buf *bp;
1226 	sw_blk_t swb[count];
1227 	register int s;
1228 	int i, j, ix;
1229 	boolean_t rv;
1230 	vm_offset_t kva, off, foff;
1231 	swp_clean_t spc;
1232 	vm_offset_t paging_offset;
1233 	vm_object_t object;
1234 	int reqaddr[count];
1235 	int failed;
1236 
1237 /*
1238 	if( count > 1)
1239 		printf("off: 0x%x, count: %d\n", m[0]->offset, count);
1240 */
1241 	spc = NULL;
1242 
1243 	object = m[0]->object;
1244 	paging_offset = object->paging_offset;
1245 
1246 	failed = 0;
1247 	for(j=0;j<count;j++) {
1248 		foff = m[j]->offset + paging_offset;
1249 		ix = swap_pager_block_index(swp, foff);
1250 		swb[j] = 0;
1251 		if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
1252 			rtvals[j] = VM_PAGER_FAIL;
1253 			failed = 1;
1254 			continue;
1255 		} else {
1256 			rtvals[j] = VM_PAGER_OK;
1257 		}
1258 		swb[j] = &swp->sw_blocks[ix];
1259 		++swb[j]->swb_locked;
1260 		if( failed) {
1261 			rtvals[j] = VM_PAGER_FAIL;
1262 			continue;
1263 		}
1264 		off = swap_pager_block_offset(swp, foff);
1265 		reqaddr[j] = swb[j]->swb_block[off];
1266 		if( reqaddr[j] == SWB_EMPTY) {
1267 			int blk;
1268 			int tries;
1269 			int ntoget;
1270 			tries = 0;
1271 			s = splbio();
1272 
1273 			/*
1274 			 * if any other pages have been allocated in this block, we
1275 			 * only try to get one page.
1276 			 */
1277 			for (i = 0; i < SWB_NPAGES; i++) {
1278 				if (swb[j]->swb_block[i] != SWB_EMPTY)
1279 					break;
1280 			}
1281 
1282 
1283 			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
1284 			/*
1285 			 * this code is a little conservative, but works
1286 			 * (the intent of this code is to allocate small chunks
1287 			 *  for small objects)
1288 			 */
1289 			if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) {
1290 				ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE;
1291 			}
1292 
1293 retrygetspace:
1294 			if (!swap_pager_full && ntoget > 1 &&
1295 				swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) {
1296 
1297 				for (i = 0; i < ntoget; i++) {
1298 					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
1299 					swb[j]->swb_valid = 0;
1300 				}
1301 
1302 				reqaddr[j] = swb[j]->swb_block[off];
1303 			} else if (!swap_pager_getswapspace(btodb(PAGE_SIZE),
1304 				&swb[j]->swb_block[off])) {
1305 				/*
1306 				 * if the allocation has failed, we try to reclaim space and
1307 				 * retry.
1308 				 */
1309 				if (++tries == 1) {
1310 					swap_pager_reclaim();
1311 					goto retrygetspace;
1312 				}
1313 				rtvals[j] = VM_PAGER_AGAIN;
1314 				failed = 1;
1315 			} else {
1316 				reqaddr[j] = swb[j]->swb_block[off];
1317 				swb[j]->swb_valid &= ~(1<<off);
1318 			}
1319 			splx(s);
1320 		}
1321 	}
1322 
1323 	/*
1324 	 * search forwards for the last contiguous page to transfer
1325 	 */
1326 	failed = 0;
1327 	for (i = 0; i < count; i++) {
1328 		if( failed || (reqaddr[i] != reqaddr[0] + i*btodb(PAGE_SIZE)) ||
1329 			(reqaddr[i] / dmmax) != (reqaddr[0] / dmmax) ||
1330 			(rtvals[i] != VM_PAGER_OK)) {
1331 			failed = 1;
1332 			if( rtvals[i] == VM_PAGER_OK)
1333 				rtvals[i] = VM_PAGER_AGAIN;
1334 		}
1335 	}
1336 
1337 	for(i = 0; i < count; i++) {
1338 		if( rtvals[i] != VM_PAGER_OK) {
1339 			if( swb[i])
1340 				--swb[i]->swb_locked;
1341 		}
1342 	}
1343 
1344 	for(i = 0; i < count; i++)
1345 		if( rtvals[i] != VM_PAGER_OK)
1346 			break;
1347 
1348 	if( i == 0) {
1349 		return VM_PAGER_AGAIN;
1350 	}
1351 
1352 	count = i;
1353 	for(i=0;i<count;i++) {
1354 		if( reqaddr[i] == SWB_EMPTY)
1355 			printf("I/O to empty block????\n");
1356 	}
1357 
1361 	/*
1362 	 * For synchronous writes, we clean up
1363 	 * all completed async pageouts.
1364 	 */
1365 	if ((flags & B_ASYNC) == 0) {
1366 		swap_pager_clean();
1367 	}
1368 
1369 	kva = 0;
1370 
1371 	/*
1372 	 * we allocate a new kva for transfers > 1 page
1373 	 * but for transfers == 1 page, the swap_pager_free list contains
1374 	 * entries that have pre-allocated kva's (for efficiency).
1375 	 */
1376 	if ( count > 1) {
1377 		kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE);
1378 		if( !kva) {
1379 			for (i = 0; i < count; i++) {
1380 				if( swb[i])
1381 					--swb[i]->swb_locked;
1382 				rtvals[i] = VM_PAGER_AGAIN;
1383 			}
1384 			return VM_PAGER_AGAIN;
1385 		}
1386 	}
1387 
1388 	/*
1389 	 * get a swap pager clean data structure, block until we get it
1390 	 */
1391 	if (swap_pager_free.tqh_first == NULL) {
1392 /*
1393 		if (flags & B_ASYNC) {
1394 			for(i=0;i<count;i++) {
1395 				rtvals[i] = VM_PAGER_AGAIN;
1396 				if( swb[i])
1397 					--swb[i]->swb_locked;
1398 			}
1399 			return VM_PAGER_AGAIN;
1400 		}
1401 */
1402 
1403 		s = splbio();
1404 		if( curproc == pageproc)
1405 			(void) swap_pager_clean();
1406 		else
1407 			wakeup((caddr_t) &vm_pages_needed);
1408 		while (swap_pager_free.tqh_first == NULL) {
1409 			swap_pager_needflags |= SWAP_FREE_NEEDED;
1410 			tsleep((caddr_t)&swap_pager_free,
1411 				PVM, "swpfre", 0);
1412 			if( curproc == pageproc)
1413 				(void) swap_pager_clean();
1414 			else
1415 				wakeup((caddr_t) &vm_pages_needed);
1416 		}
1417 		splx(s);
1418 	}
1419 
1420 	spc = swap_pager_free.tqh_first;
1421 	TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
1422 	if( !kva) {
1423 		kva = spc->spc_kva;
1424 		spc->spc_altkva = 0;
1425 	} else {
1426 		spc->spc_altkva = kva;
1427 	}
1428 
1429 	/*
1430 	 * map our page(s) into kva for I/O
1431 	 */
1432 	for (i = 0; i < count; i++) {
1433 		pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i]));
1434 	}
1435 	pmap_update();
1436 
1437 	/*
1438 	 * get the base I/O offset into the swap file
1439 	 */
1440 	for(i=0;i<count;i++) {
1441 		foff = m[i]->offset + paging_offset;
1442 		off = swap_pager_block_offset(swp, foff);
1443 		/*
1444 		 * if we are setting the valid bit anew,
1445 		 * then diminish the swap free space
1446 		 */
1447 		if( (swb[i]->swb_valid & (1 << off)) == 0)
1448 			vm_swap_size -= btodb(PAGE_SIZE);
1449 
1450 		/*
1451 		 * set the valid bit
1452 		 */
1453 		swb[i]->swb_valid |= (1 << off);
1454 		/*
1455 		 * and unlock the data structure
1456 		 */
1457 		--swb[i]->swb_locked;
1458 	}
1459 
1460 	s = splbio();
1461 	/*
1462 	 * Get a swap buffer header and perform the IO
1463 	 */
1464 	bp = spc->spc_bp;
1465 	bzero(bp, sizeof *bp);
1466 	bp->b_spc = spc;
1467 
1468 	bp->b_flags = B_BUSY;
1469 	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
1470 	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
1471 	crhold(bp->b_rcred);
1472 	crhold(bp->b_wcred);
1473 	bp->b_un.b_addr = (caddr_t) kva;
1474 	bp->b_blkno = reqaddr[0];
1475 	bgetvp( swapdev_vp, bp);
1476 /*
1477 	VHOLD(swapdev_vp);
1478 	bp->b_vp = swapdev_vp;
1479 	if (swapdev_vp->v_type == VBLK)
1480 		bp->b_dev = swapdev_vp->v_rdev;
1481 */
1482 	bp->b_bcount = PAGE_SIZE*count;
1483 	bp->b_bufsize = PAGE_SIZE*count;
1484 	swapdev_vp->v_numoutput++;
1485 
1486 	/*
1487 	 * If this is an async write we set up additional buffer fields
1488 	 * and place a "cleaning" entry on the inuse queue.
1489 	 */
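	/*
	 * Lifecycle of an async pageout: the spc rides the inuse queue
	 * until swap_pager_iodone moves it to the done queue at
	 * interrupt time; swap_pager_clean later unmaps the pages,
	 * calls swap_pager_finish to mark them clean (or re-launder
	 * them on error), and returns the spc to swap_pager_free.
	 */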
1490 	if ( flags & B_ASYNC ) {
1491 		spc->spc_flags = 0;
1492 		spc->spc_swp = swp;
1493 		for(i=0;i<count;i++)
1494 			spc->spc_m[i] = m[i];
1495 		spc->spc_count = count;
1496 		/*
1497 		 * the completion routine for async writes
1498 		 */
1499 		bp->b_flags |= B_CALL;
1500 		bp->b_iodone = swap_pager_iodone;
1501 		bp->b_dirtyoff = 0;
1502 		bp->b_dirtyend = bp->b_bcount;
1503 		swp->sw_poip++;
1504 		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
1505 	} else {
1506 		swp->sw_poip++;
1507 		bp->b_flags |= B_CALL;
1508 		bp->b_iodone = swap_pager_iodone1;
1509 	}
1510 	/*
1511 	 * perform the I/O
1512 	 */
1513 	VOP_STRATEGY(bp);
1514 	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) {
1515 		if ((bp->b_flags & B_DONE) == B_DONE) {
1516 			swap_pager_clean();
1517 		}
1518 		splx(s);
1519 		for(i=0;i<count;i++) {
1520 			rtvals[i] = VM_PAGER_PEND;
1521 		}
1522 		return VM_PAGER_PEND;
1523 	}
1524 
1525 	/*
1526 	 * wait for the sync I/O to complete
1527 	 */
1528 	while ((bp->b_flags & B_DONE) == 0) {
1529 		tsleep((caddr_t)bp, PVM, "swwrt", 0);
1530 	}
1531 	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
1532 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);
1533 
1534 	--swp->sw_poip;
1535 	if (swp->sw_poip == 0)
1536 		wakeup((caddr_t) swp);
1537 
1538 	if (bp->b_vp)
1539 		brelvp(bp);
1540 
1541 	splx(s);
1542 
1543 	/*
1544 	 * remove the mapping for kernel virtual
1545 	 */
1546 	pmap_remove(vm_map_pmap(pager_map), kva, kva + count * PAGE_SIZE);
1547 
1548 	/*
1549 	 * if we have written the page, then indicate that the page
1550 	 * is clean.
1551 	 */
1552 	if (rv == VM_PAGER_OK) {
1553 		for(i=0;i<count;i++) {
1554 			if( rtvals[i] == VM_PAGER_OK) {
1555 				m[i]->flags |= PG_CLEAN;
1556 				m[i]->flags &= ~PG_LAUNDRY;
1557 				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
1558 				/*
1559 				 * optimization, if a page has been read during the
1560 				 * pageout process, we activate it.
1561 				 */
1562 				if ( (m[i]->flags & PG_ACTIVE) == 0 &&
1563 					pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))
1564 					vm_page_activate(m[i]);
1565 			}
1566 		}
1567 	} else {
1568 		for(i=0;i<count;i++) {
1569 			rtvals[i] = rv;
1570 			m[i]->flags |= PG_LAUNDRY;
1571 		}
1572 	}
1573 
1574 	if( spc->spc_altkva)
1575 		kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE);
1576 
1577 	if( bp->b_rcred != NOCRED)
1578 		crfree(bp->b_rcred);
1579 	if( bp->b_wcred != NOCRED)
1580 		crfree(bp->b_wcred);
1581 	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
1582 	if (swap_pager_needflags & SWAP_FREE_NEEDED) {
1583 		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
1584 		wakeup((caddr_t)&swap_pager_free);
1585 	}
1586 
1587 	return(rv);
1588 }
1589 
1590 boolean_t
1591 swap_pager_clean()
1592 {
1593 	register swp_clean_t spc, tspc;
1594 	register int s;
1595 
1596 	tspc = NULL;
1597 	if (swap_pager_done.tqh_first == NULL)
1598 		return FALSE;
1599 	for (;;) {
1600 		s = splbio();
1601 		/*
1602 		 * Lookup and removal from the done list must be done
1603 		 * at splbio() to avoid conflicts with swap_pager_iodone.
1604 		 */
1605 		while (spc = swap_pager_done.tqh_first) {
1606 			if( spc->spc_altkva) {
1607 				pmap_remove(vm_map_pmap(pager_map), spc->spc_altkva, spc->spc_altkva + spc->spc_count * PAGE_SIZE);
1608 				kmem_free_wakeup(pager_map, spc->spc_altkva, spc->spc_count * PAGE_SIZE);
1609 				spc->spc_altkva = 0;
1610 			} else {
1611 				pmap_remove(vm_map_pmap(pager_map), spc->spc_kva, spc->spc_kva + PAGE_SIZE);
1612 			}
1613 			swap_pager_finish(spc);
1614 			TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
1615 			goto doclean;
1616 		}
1617 
1618 		/*
1619 		 * No operations done; that's all we can do for now.
1620 		 */
1621 
1622 		splx(s);
1623 		break;
1624 
1625 		/*
1626 		 * The desired page was found to be busy earlier in
1627 		 * the scan but has since completed.
1628 		 */
1629 doclean:
1630 		if (tspc && tspc == spc) {
1631 			tspc = NULL;
1632 		}
1633 		spc->spc_flags = 0;
1634 		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
1635 		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
1636 			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
1637 			wakeup((caddr_t)&swap_pager_free);
1638 		}
1639 		++cleandone;
1640 		splx(s);
1641 	}
1642 
1643 	return(tspc ? TRUE : FALSE);
1644 }
1645 
1646 void
1647 swap_pager_finish(spc)
1648 	register swp_clean_t spc;
1649 {
1650 	vm_object_t object = spc->spc_m[0]->object;
1651 	int i;
1652 
1653 	if ((object->paging_in_progress -= spc->spc_count) == 0)
1654 		thread_wakeup((int) object);
1655 
1656 	/*
1657 	 * If no error mark as clean and inform the pmap system.
1658 	 * If error, mark as dirty so we will try again.
1659 	 * (XXX could get stuck doing this, should give up after a while)
1660 	 */
1661 	if (spc->spc_flags & SPC_ERROR) {
1662 		for(i=0;i<spc->spc_count;i++) {
1663 			printf("swap_pager_finish: clean of page %x failed\n",
1664 			       VM_PAGE_TO_PHYS(spc->spc_m[i]));
1665 			spc->spc_m[i]->flags |= PG_LAUNDRY;
1666 		}
1667 	} else {
1668 		for(i=0;i<spc->spc_count;i++) {
1669 			pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i]));
1670 			spc->spc_m[i]->flags |= PG_CLEAN;
1671 		}
1672 	}
1673 
1674 
1675 	for(i=0;i<spc->spc_count;i++) {
1676 		/*
1677 		 * we wake up any processes that are waiting on
1678 		 * these pages.
1679 		 */
1680 		PAGE_WAKEUP(spc->spc_m[i]);
1681 	}
1682 	nswiodone -= spc->spc_count;
1683 
1684 	return;
1685 }
1686 
1687 /*
1688  * swap_pager_iodone
1689  */
1690 void
1691 swap_pager_iodone(bp)
1692 	register struct buf *bp;
1693 {
1694 	register swp_clean_t spc;
1695 	int s;
1696 
1697 	s = splbio();
1698 	spc = (swp_clean_t) bp->b_spc;
1699 	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
1700 	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
1701 	if (bp->b_flags & B_ERROR) {
1702 		spc->spc_flags |= SPC_ERROR;
1703 		printf("error %d blkno %d sz %d ",
1704 			bp->b_error, bp->b_blkno, bp->b_bcount);
1705 	}
1706 
1707 /*
1708 	if ((bp->b_flags & B_READ) == 0)
1709 		vwakeup(bp);
1710 */
1711 
1712 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC);
1713 	if (bp->b_vp) {
1714 		brelvp(bp);
1715 	}
1716 	if( bp->b_rcred != NOCRED)
1717 		crfree(bp->b_rcred);
1718 	if( bp->b_wcred != NOCRED)
1719 		crfree(bp->b_wcred);
1720 
1721 	nswiodone += spc->spc_count;
1722 	if (--spc->spc_swp->sw_poip == 0) {
1723 		wakeup((caddr_t)spc->spc_swp);
1724 	}
1725 
1726 	if ((swap_pager_needflags & SWAP_FREE_NEEDED) ||
1727 	    swap_pager_inuse.tqh_first == 0) {
1728 		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
1729 		wakeup((caddr_t)&swap_pager_free);
1730 		wakeup((caddr_t)&vm_pages_needed);
1731 	}
1732 
1733 	if (vm_pageout_pages_needed) {
1734 		wakeup((caddr_t)&vm_pageout_pages_needed);
1735 	}
1736 
1737 	if ((swap_pager_inuse.tqh_first == NULL) ||
1738 	    (cnt.v_free_count < cnt.v_free_min &&
1739 	    nswiodone + cnt.v_free_count >= cnt.v_free_min) ) {
1740 		wakeup((caddr_t)&vm_pages_needed);
1741 	}
1742 	splx(s);
1743 }
1744 
1745 int bswneeded;
1746 /* TAILQ_HEAD(swqueue, buf) bswlist; */
1747 /*
1748  * allocate a physical buffer
1749  */
1750 struct buf *
1751 getpbuf() {
1752 	int s;
1753 	struct buf *bp;
1754 
1755 	s = splbio();
1756 	/* get a bp from the swap buffer header pool */
1757 	while ((bp = bswlist.tqh_first) == NULL) {
1758 		bswneeded = 1;
1759 		tsleep((caddr_t)&bswneeded, PVM, "wswbuf", 0);
1760 	}
1761 	TAILQ_REMOVE(&bswlist, bp, b_freelist);
1762 
1763 	splx(s);
1764 
1765 	bzero(bp, sizeof *bp);
1766 	bp->b_rcred = NOCRED;
1767 	bp->b_wcred = NOCRED;
1768 	return bp;
1769 }
1770 
1771 /*
1772  * allocate a physical buffer, if one is available
1773  */
1774 struct buf *
1775 trypbuf() {
1776 	int s;
1777 	struct buf *bp;
1778 
1779 	s = splbio();
1780 	if ((bp = bswlist.tqh_first) == NULL) {
1781 		splx(s);
1782 		return NULL;
1783 	}
1784 	TAILQ_REMOVE(&bswlist, bp, b_freelist);
1785 	splx(s);
1786 
1787 	bzero(bp, sizeof *bp);
1788 	bp->b_rcred = NOCRED;
1789 	bp->b_wcred = NOCRED;
1790 	return bp;
1791 }
1792 
1793 /*
1794  * release a physical buffer
1795  */
1796 void
1797 relpbuf(bp)
1798 	struct buf *bp;
1799 {
1800 	int s;
1801 
1802 	s = splbio();
1803 
1804 	if (bp->b_rcred != NOCRED) {
1805 		crfree(bp->b_rcred);
1806 		bp->b_rcred = NOCRED;
1807 	}
1808 	if (bp->b_wcred != NOCRED) {
1809 		crfree(bp->b_wcred);
1810 		bp->b_wcred = NOCRED;
1811 	}
1812 
1813 	if (bp->b_vp)
1814 		brelvp(bp);
1815 
1816 	TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);
1817 
1818 	if (bswneeded) {
1819 		bswneeded = 0;
1820 		wakeup((caddr_t)&bswlist);
1821 	}
1822 	splx(s);
1823 }
1824 
1825 /*
1826  * return true if any swap control structures can be allocated
1827  */
1828 int
1829 swap_pager_ready() {
1830 	if( swap_pager_free.tqh_first)
1831 		return 1;
1832 	else
1833 		return 0;
1834 }
1835