xref: /freebsd/sys/vm/swap_pager.c (revision 380a989b3223d455375b4fae70fd0b9bdd43bafb)
/*
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
 *
 *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
 * $Id: swap_pager.c,v 1.105 1998/12/29 22:53:51 dt Exp $
 */

/*
 * Quick hack to page to dedicated partition(s).
 * TODO:
 *	Add multiprocessor locks
 *	Deal with async writes in a better fashion
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
#include <sys/rlist.h>

#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
#endif

#ifndef NPENDINGIO
#define NPENDINGIO	16
#endif

#define SWB_NPAGES MAX_PAGEOUT_CLUSTER

#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

static int nswiodone;
int swap_pager_full;
extern int vm_swap_size;
static int no_swap_space = 1;
static int max_pageout_cluster;
struct rlisthdr swaplist;

TAILQ_HEAD(swpclean, swpagerclean);

typedef struct swpagerclean *swp_clean_t;

static struct swpagerclean {
	TAILQ_ENTRY(swpagerclean) spc_list;
	int spc_flags;
	struct buf *spc_bp;
	vm_object_t spc_object;
	vm_offset_t spc_kva;
	int spc_first;
	int spc_count;
	vm_page_t spc_m[MAX_PAGEOUT_CLUSTER];
} swcleanlist[NPENDINGIO];
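
/*
 * Lifecycle sketch (as implied by the queue manipulation below): each
 * swpagerclean ("spc") entry starts on swap_pager_free, moves to
 * swap_pager_inuse while an async pageout is in flight, is moved to
 * swap_pager_done by swap_pager_iodone(), and is returned to the free
 * list by swap_pager_finish()/spc_free().
 */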

/* spc_flags values */
#define SPC_ERROR	0x01

#define SWB_EMPTY (-1)

/* list of completed page cleans */
static struct swpclean swap_pager_done;

/* list of pending page cleans */
static struct swpclean swap_pager_inuse;

/* list of free pager clean structs */
static struct swpclean swap_pager_free;
static int swap_pager_free_count;
static int swap_pager_free_pending;

/* list of "named" anon region objects */
static struct pagerlst swap_pager_object_list;

/* list of "unnamed" anon region objects */
struct pagerlst swap_pager_un_object_list;

#define	SWAP_FREE_NEEDED	0x1	/* need a swap block */
#define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2
static int swap_pager_needflags;

static struct pagerlst *swp_qs[] = {
	&swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0
};
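
/*
 * Note: swp_qs is a NULL-terminated array; swap_pager_reclaim() walks
 * both object lists by iterating until it hits the terminating NULL.
 */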

/*
 * pagerops for OBJT_SWAP - "swap pager".
 */
static vm_object_t
		swap_pager_alloc __P((void *handle, vm_ooffset_t size,
				      vm_prot_t prot, vm_ooffset_t offset));
static void	swap_pager_dealloc __P((vm_object_t object));
static boolean_t
		swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex,
					int *before, int *after));
static int	swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
static void	swap_pager_init __P((void));
static void	spc_free __P((swp_clean_t));

struct pagerops swappagerops = {
	swap_pager_init,
	swap_pager_alloc,
	swap_pager_dealloc,
	swap_pager_getpages,
	swap_pager_putpages,
	swap_pager_haspage,
	swap_pager_sync
};
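
/*
 * The initializers above fill struct pagerops positionally: in order they
 * are the swap pager's init, alloc, dealloc, getpages, putpages, haspage
 * and sync operations of the generic pager interface (an orientation note,
 * not quoted from this file; see vm_pager.h for the authoritative layout).
 */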

static int npendingio;
static int dmmin;
int dmmax;

static int	swap_pager_block_index __P((vm_pindex_t pindex));
static int	swap_pager_block_offset __P((vm_pindex_t pindex));
static daddr_t *swap_pager_diskaddr __P((vm_object_t object,
					  vm_pindex_t pindex, int *valid));
static void	swap_pager_finish __P((swp_clean_t spc));
static void	swap_pager_free_swap __P((vm_object_t object));
static void	swap_pager_freeswapspace __P((vm_object_t object,
					      unsigned int from,
					      unsigned int to));
static int	swap_pager_getswapspace __P((vm_object_t object,
					     unsigned int amount,
					     daddr_t *rtval));
static void	swap_pager_iodone __P((struct buf *));
static void	swap_pager_iodone1 __P((struct buf *bp));
static void	swap_pager_reclaim __P((void));
static void	swap_pager_ridpages __P((vm_page_t *m, int count,
					 int reqpage));
static void	swap_pager_setvalid __P((vm_object_t object,
					 vm_offset_t offset, int valid));
static __inline void	swapsizecheck __P((void));

#define SWAPLOW (vm_swap_size < (512 * btodb(PAGE_SIZE)))

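/*
 * swapsizecheck() implements a simple hysteresis: swap_pager_full is set
 * when fewer than 128 pages worth of swap remain, and cleared only once
 * more than 192 pages worth are free again, so the flag does not flap at
 * the boundary.  vm_swap_size is kept in DEV_BSIZE units; for example,
 * assuming PAGE_SIZE is 4096 and DEV_BSIZE is 512, btodb(PAGE_SIZE) is 8
 * and the two thresholds are 1024 and 1536 disk blocks respectively.
 */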
static __inline void
swapsizecheck()
{
	if (vm_swap_size < 128 * btodb(PAGE_SIZE)) {
		if (swap_pager_full == 0)
			printf("swap_pager: out of swap space\n");
		swap_pager_full = 1;
	} else if (vm_swap_size > 192 * btodb(PAGE_SIZE))
		swap_pager_full = 0;
}

static void
swap_pager_init()
{
	int maxsafepending;
	TAILQ_INIT(&swap_pager_object_list);
	TAILQ_INIT(&swap_pager_un_object_list);

	/*
	 * Initialize clean lists
	 */
	TAILQ_INIT(&swap_pager_inuse);
	TAILQ_INIT(&swap_pager_done);
	TAILQ_INIT(&swap_pager_free);
	swap_pager_free_count = 0;

	/*
	 * Calculate the swap allocation constants.
	 */
	dmmin = PAGE_SIZE / DEV_BSIZE;
	dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2;
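	/*
	 * Worked example (assuming PAGE_SIZE 4096, DEV_BSIZE 512 and the
	 * default SWB_NPAGES of 16): dmmin = 8 disk blocks (one page) and
	 * dmmax = btodb(16 * 4096) * 2 = 256 disk blocks, i.e. twice the
	 * span of one fully populated swap block cluster.
	 */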

	maxsafepending = cnt.v_free_min - cnt.v_free_reserved;
	npendingio = NPENDINGIO;
	max_pageout_cluster = MAX_PAGEOUT_CLUSTER;

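	/*
	 * Scale the pending-I/O limits down when the page reserve is small.
	 * For example, if maxsafepending were 128 pages, the default
	 * 2 * 16 * 16 = 512 potentially in-flight pages would exceed it, so
	 * the cluster size is halved to 8 and npendingio becomes
	 * 128 / 16 = 8 (the figures are illustrative; the actual values
	 * depend on cnt).
	 */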
	if ((2 * NPENDINGIO * MAX_PAGEOUT_CLUSTER) > maxsafepending) {
		max_pageout_cluster = MAX_PAGEOUT_CLUSTER / 2;
		npendingio = maxsafepending / (2 * max_pageout_cluster);
		if (npendingio < 2)
			npendingio = 2;
	}
}

void
swap_pager_swap_init()
{
	swp_clean_t spc;
	struct buf *bp;
	int i;

	/*
	 * kva's are allocated here so that we don't need to keep doing
	 * pageable kmem allocations at runtime
	 */
	for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) {
		spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * max_pageout_cluster);
		if (!spc->spc_kva) {
			break;
		}
		spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_KERNEL);
		if (!spc->spc_bp) {
			kmem_free_wakeup(pager_map, spc->spc_kva,
			    PAGE_SIZE * max_pageout_cluster);
			break;
		}
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		swap_pager_free_count++;
	}
}

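/*
 * Allocate the swap metadata for an object: one sw_blk_t per SWB_NPAGES
 * pages of the object.  swb_block[] holds the disk address of each page
 * (SWB_EMPTY, i.e. -1, when no swap has been allocated for it) and
 * swb_valid is a per-page bitmask recording which of those blocks have
 * actually been written.
 */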
int
swap_pager_swp_alloc(object, wait)
	vm_object_t object;
	int wait;
{
	sw_blk_t swb;
	int nblocks;
	int i, j;

	nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES;
	swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait);
	if (swb == NULL)
		return 1;

	for (i = 0; i < nblocks; i++) {
		swb[i].swb_valid = 0;
		swb[i].swb_locked = 0;
		for (j = 0; j < SWB_NPAGES; j++)
			swb[i].swb_block[j] = SWB_EMPTY;
	}

	object->un_pager.swp.swp_nblocks = nblocks;
	object->un_pager.swp.swp_allocsize = 0;
	object->un_pager.swp.swp_blocks = swb;
	object->un_pager.swp.swp_poip = 0;

	if (object->handle != NULL) {
		TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list);
	} else {
		TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list);
	}

	return 0;
}

/*
 * Allocate an object and associated resources.
 * Note that if we are called from the pageout daemon (handle == NULL)
 * we should not wait for memory, as that could result in deadlock.
 */
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
		 vm_ooffset_t offset)
{
	vm_object_t object;

	/*
	 * If this is a "named" anonymous region, look it up and use the
	 * object if it exists, otherwise allocate a new one.
	 */
	if (handle) {
		object = vm_pager_object_lookup(&swap_pager_object_list, handle);
		if (object != NULL) {
			vm_object_reference(object);
		} else {
			/*
			 * XXX - there is a race condition here. Two processes
			 * can request the same named object simultaneously,
			 * and if one blocks for memory, the result is a disaster.
			 * Probably quite rare, but it is yet another reason to just
			 * rip support for "named anonymous regions" out altogether.
			 */
			object = vm_object_allocate(OBJT_SWAP,
				OFF_TO_IDX(offset + PAGE_MASK + size));
			object->handle = handle;
			(void) swap_pager_swp_alloc(object, M_WAITOK);
		}
	} else {
		object = vm_object_allocate(OBJT_SWAP,
			OFF_TO_IDX(offset + PAGE_MASK + size));
		(void) swap_pager_swp_alloc(object, M_WAITOK);
	}

	return (object);
}

/*
 * Return a pointer to the disk block entry associated with the given
 * object and page index, or NULL if the index is out of range.  As a
 * side effect, *valid is set to indicate whether the block has been
 * written.
 */
static __inline daddr_t *
swap_pager_diskaddr(object, pindex, valid)
	vm_object_t object;
	vm_pindex_t pindex;
	int *valid;
{
	register sw_blk_t swb;
	int ix;

	if (valid)
		*valid = 0;
	ix = pindex / SWB_NPAGES;
	if ((ix >= object->un_pager.swp.swp_nblocks) ||
	    (pindex >= object->size)) {
		return (NULL);
	}
	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = pindex % SWB_NPAGES;
	if (valid)
		*valid = swb->swb_valid & (1 << ix);
	return &swb->swb_block[ix];
}

/*
 * Utility routine to set the valid (written) bit for
 * a block associated with a pager and offset
 */
static void
swap_pager_setvalid(object, offset, valid)
	vm_object_t object;
	vm_offset_t offset;
	int valid;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / SWB_NPAGES;
	if (ix >= object->un_pager.swp.swp_nblocks)
		return;

	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = offset % SWB_NPAGES;
	if (valid)
		swb->swb_valid |= (1 << ix);
	else
		swb->swb_valid &= ~(1 << ix);
	return;
}

/*
 * this routine allocates swap space with a fragmentation
 * minimization policy.
 */
static int
swap_pager_getswapspace(object, amount, rtval)
	vm_object_t object;
	unsigned int amount;
	daddr_t *rtval;
{
	unsigned location;

	vm_swap_size -= amount;

	if (!rlist_alloc(&swaplist, amount, &location)) {
		vm_swap_size += amount;
		return 0;
	} else {
		swapsizecheck();
		object->un_pager.swp.swp_allocsize += amount;
		*rtval = location;
		return 1;
	}
}

/*
 * this routine frees swap space with a fragmentation
 * minimization policy.
 */
static void
swap_pager_freeswapspace(object, from, to)
	vm_object_t object;
	unsigned int from;
	unsigned int to;
{
	rlist_free(&swaplist, from, to);
	vm_swap_size += (to - from) + 1;
	object->un_pager.swp.swp_allocsize -= (to - from) + 1;
	swapsizecheck();
}

/*
 * this routine frees swap blocks from a specified pager
 */
void
swap_pager_freespace(object, start, size)
	vm_object_t object;
	vm_pindex_t start;
	vm_size_t size;
{
	vm_pindex_t i;
	int s;

	s = splvm();
	for (i = start; i < start + size; i += 1) {
		int valid;
		daddr_t *addr = swap_pager_diskaddr(object, i, &valid);

		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1);
			if (valid) {
				swap_pager_setvalid(object, i, 0);
			}
			*addr = SWB_EMPTY;
		}
	}
	splx(s);
}
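
/*
 * Usage sketch (hypothetical caller): when a range of an object's pages
 * is discarded, e.g. on truncation, the backing swap can be released with
 *
 *	swap_pager_freespace(object, pindex, npages);
 *
 * which frees one page's worth of disk blocks (btodb(PAGE_SIZE)) per
 * allocated entry and clears the corresponding valid bits.
 */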

/*
 * same as freespace, but don't free the blocks; just clear their valid
 * bits so the pages will be treated as demand-zero ("DMZ") next time
 */
void
swap_pager_dmzspace(object, start, size)
	vm_object_t object;
	vm_pindex_t start;
	vm_size_t size;
{
	vm_pindex_t i;
	int s;

	s = splvm();
	for (i = start; i < start + size; i += 1) {
		int valid;
		daddr_t *addr = swap_pager_diskaddr(object, i, &valid);

		if (addr && *addr != SWB_EMPTY) {
			if (valid) {
				swap_pager_setvalid(object, i, 0);
			}
		}
	}
	splx(s);
}

static void
swap_pager_free_swap(object)
	vm_object_t object;
{
	register int i, j;
	register sw_blk_t swb;
	int first_block=0, block_count=0;
	int s;
	/*
	 * Free left over swap blocks
	 */
	swb = object->un_pager.swp.swp_blocks;
	if (swb == NULL) {
		return;
	}

	s = splvm();
	for (i = 0; i < object->un_pager.swp.swp_nblocks; i++, swb++) {
		for (j = 0; j < SWB_NPAGES; j++) {
			if (swb->swb_block[j] != SWB_EMPTY) {
				/*
				 * initially the length of the run is zero
				 */
				if (block_count == 0) {
					first_block = swb->swb_block[j];
					block_count = btodb(PAGE_SIZE);
					swb->swb_block[j] = SWB_EMPTY;
				/*
				 * if the new block can be included in the current run
				 */
				} else if (swb->swb_block[j] == first_block + block_count) {
					block_count += btodb(PAGE_SIZE);
					swb->swb_block[j] = SWB_EMPTY;
				/*
				 * terminate the previous run, and start a new one
				 */
				} else {
					swap_pager_freeswapspace(object, first_block,
					    (unsigned) first_block + block_count - 1);
					first_block = swb->swb_block[j];
					block_count = btodb(PAGE_SIZE);
					swb->swb_block[j] = SWB_EMPTY;
				}
			}
		}
	}

	if (block_count) {
		swap_pager_freeswapspace(object, first_block,
		    (unsigned) first_block + block_count - 1);
	}
	splx(s);
}

/*
 * swap_pager_reclaim frees up over-allocated space from all pagers.
 * This eliminates internal fragmentation due to allocation of space
 * for segments that are never swapped to. It has been written so that
 * it does not block until the rlist_free operation occurs; it keeps
 * the queues consistent.
 */

/*
 * Maximum number of blocks (pages) to reclaim per pass
 */
#define MAXRECLAIM 128

static void
swap_pager_reclaim()
{
	vm_object_t object;
	int i, j, k;
	int s;
	int reclaimcount;
	static struct {
		int address;
		vm_object_t object;
	} reclaims[MAXRECLAIM];
	static int in_reclaim;

	/*
	 * allow only one process to be in the swap_pager_reclaim subroutine
	 */
	s = splvm();
	if (in_reclaim) {
		tsleep(&in_reclaim, PSWP, "swrclm", 0);
		splx(s);
		return;
	}
	in_reclaim = 1;
	reclaimcount = 0;

	/* for each pager queue */
	for (k = 0; swp_qs[k]; k++) {

		object = TAILQ_FIRST(swp_qs[k]);
		while (object && (reclaimcount < MAXRECLAIM)) {

			/*
			 * see if any blocks associated with a pager have been
			 * allocated but not used (written)
			 */
			if ((object->flags & OBJ_DEAD) == 0 &&
				(object->paging_in_progress == 0)) {
				for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) {
					sw_blk_t swb = &object->un_pager.swp.swp_blocks[i];

					if (swb->swb_locked)
						continue;
					for (j = 0; j < SWB_NPAGES; j++) {
						if (swb->swb_block[j] != SWB_EMPTY &&
						    (swb->swb_valid & (1 << j)) == 0) {
							reclaims[reclaimcount].address = swb->swb_block[j];
							reclaims[reclaimcount++].object = object;
							swb->swb_block[j] = SWB_EMPTY;
							if (reclaimcount >= MAXRECLAIM)
								goto rfinished;
						}
					}
				}
			}
			object = TAILQ_NEXT(object, pager_object_list);
		}
	}

rfinished:

	/*
	 * free the blocks that have been added to the reclaim list
	 */
	for (i = 0; i < reclaimcount; i++) {
		swap_pager_freeswapspace(reclaims[i].object,
		    reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1);
	}
	splx(s);
	in_reclaim = 0;
	wakeup(&in_reclaim);
}

/*
 * swap_pager_copy copies blocks from one pager to another and
 * destroys the source pager
 */

void
swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset,
	offset, destroysource)
	vm_object_t srcobject;
	vm_pindex_t srcoffset;
	vm_object_t dstobject;
	vm_pindex_t dstoffset;
	vm_pindex_t offset;
	int destroysource;
{
	vm_pindex_t i;
	int origsize;
	int s;

	if (vm_swap_size)
		no_swap_space = 0;

	origsize = srcobject->un_pager.swp.swp_allocsize;

	/*
	 * remove the source object from the swap_pager internal queue
	 */
	if (destroysource) {
		if (srcobject->handle == NULL) {
			TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list);
		} else {
			TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list);
		}
	}

	s = splvm();
	while (srcobject->un_pager.swp.swp_poip) {
		tsleep(srcobject, PVM, "spgout", 0);
	}

	/*
	 * clean all of the pages that are currently active and finished
	 */
	if (swap_pager_free_pending)
		swap_pager_sync();

	/*
	 * transfer source to destination
	 */
	for (i = 0; i < dstobject->size; i += 1) {
		int srcvalid, dstvalid;
		daddr_t *srcaddrp = swap_pager_diskaddr(srcobject,
				i + offset + srcoffset, &srcvalid);
		daddr_t *dstaddrp;

		/*
		 * see if the source has space allocated
		 */
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
			/*
			 * if the source is valid and the dest has no space,
			 * then copy the allocation from the source to the
			 * dest.
			 */
			if (srcvalid) {
				dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset,
							&dstvalid);
				/*
				 * if the dest has a block that has not been
				 * written, free it so the source block can
				 * take its place.  (A dest block that is
				 * already valid wins; the source block is
				 * then deallocated below without copying.)
				 */
				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
					swap_pager_freeswapspace(dstobject, *dstaddrp,
						*dstaddrp + btodb(PAGE_SIZE) - 1);
					*dstaddrp = SWB_EMPTY;
				}
				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
					*dstaddrp = *srcaddrp;
					*srcaddrp = SWB_EMPTY;
					dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE);
					srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE);
					swap_pager_setvalid(dstobject, i + dstoffset, 1);
				}
			}
			/*
			 * if the source is not empty at this point, then
			 * deallocate the space.
			 */
			if (*srcaddrp != SWB_EMPTY) {
				swap_pager_freeswapspace(srcobject, *srcaddrp,
					*srcaddrp + btodb(PAGE_SIZE) - 1);
				*srcaddrp = SWB_EMPTY;
			}
		}
	}
	splx(s);

	/*
	 * Free left over swap blocks
	 */
	if (destroysource) {
		swap_pager_free_swap(srcobject);

		if (srcobject->un_pager.swp.swp_allocsize) {
			printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n",
			    srcobject->un_pager.swp.swp_allocsize, origsize);
		}

		free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA);
		srcobject->un_pager.swp.swp_blocks = NULL;
	}
	return;
}

static void
swap_pager_dealloc(object)
	vm_object_t object;
{
	int s;
	sw_blk_t swb;

	/*
	 * Remove from list right away so lookups will fail if we block for
	 * pageout completion.
	 */
	if (object->handle == NULL) {
		TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
	} else {
		TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list);
	}

	/*
	 * Wait for all pageouts to finish and remove all entries from
	 * cleaning list.
	 */
	s = splvm();
	while (object->un_pager.swp.swp_poip) {
		tsleep(object, PVM, "swpout", 0);
	}
	splx(s);

	if (swap_pager_free_pending)
		swap_pager_sync();

	/*
	 * Free left over swap blocks
	 */
	swap_pager_free_swap(object);

	if (object->un_pager.swp.swp_allocsize) {
		printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n",
		    object->un_pager.swp.swp_allocsize);
	}
	swb = object->un_pager.swp.swp_blocks;
	if (swb) {
		/*
		 * Free swap management resources
		 */
		free(swb, M_VMPGDATA);
		object->un_pager.swp.swp_blocks = NULL;
	}
}

static __inline int
swap_pager_block_index(pindex)
	vm_pindex_t pindex;
{
	return (pindex / SWB_NPAGES);
}

static __inline int
swap_pager_block_offset(pindex)
	vm_pindex_t pindex;
{
	return (pindex % SWB_NPAGES);
}
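
/*
 * Indexing example (with the default SWB_NPAGES of 16): page index 37
 * lives in sw_blk 37 / 16 = 2 at slot 37 % 16 = 5, i.e. its disk address
 * is swp_blocks[2].swb_block[5] and its written bit is
 * swp_blocks[2].swb_valid & (1 << 5).
 */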

/*
 * swap_pager_haspage returns TRUE if the pager has data that has
 * been written out.
 */
static boolean_t
swap_pager_haspage(object, pindex, before, after)
	vm_object_t object;
	vm_pindex_t pindex;
	int *before;
	int *after;
{
	register sw_blk_t swb;
	int ix;

	if (before != NULL)
		*before = 0;
	if (after != NULL)
		*after = 0;
	ix = pindex / SWB_NPAGES;
	if (ix >= object->un_pager.swp.swp_nblocks) {
		return (FALSE);
	}
	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = pindex % SWB_NPAGES;

	if (swb->swb_block[ix] != SWB_EMPTY) {

		if (swb->swb_valid & (1 << ix)) {
			int tix;
			if (before) {
				for (tix = ix - 1; tix >= 0; --tix) {
					if ((swb->swb_valid & (1 << tix)) == 0)
						break;
					if ((swb->swb_block[tix] +
						(ix - tix) * (PAGE_SIZE/DEV_BSIZE)) !=
						swb->swb_block[ix])
						break;
					(*before)++;
				}
			}

			if (after) {
				for (tix = ix + 1; tix < SWB_NPAGES; tix++) {
					if ((swb->swb_valid & (1 << tix)) == 0)
						break;
					if ((swb->swb_block[tix] -
						(tix - ix) * (PAGE_SIZE/DEV_BSIZE)) !=
						swb->swb_block[ix])
						break;
					(*after)++;
				}
			}

			return TRUE;
		}
	}
	return (FALSE);
}
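
/*
 * Contiguity example (a sketch, assuming PAGE_SIZE/DEV_BSIZE == 8): if
 * three consecutive valid pages of one sw_blk sit at disk blocks 96, 104
 * and 112, a lookup on the middle one reports *before = 1 and *after = 1,
 * telling the caller how far a clustered transfer can extend either way.
 */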

/*
 * Wakeup based upon spc state
 */
static void
spc_wakeup(void)
{
	if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT;
		wakeup(&swap_pager_needflags);
	} else if ((swap_pager_needflags & SWAP_FREE_NEEDED) &&
		swap_pager_free_count >= ((2 * npendingio) / 3)) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup(&swap_pager_free);
	}
}

/*
 * Free an spc structure
 */
static void
spc_free(spc)
	swp_clean_t spc;
{
	spc->spc_flags = 0;
	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
	swap_pager_free_count++;
	if (swap_pager_needflags) {
		spc_wakeup();
	}
}

/*
 * swap_pager_ridpages is a convenience routine that deallocates all
 * but the required page.  this is usually used in error returns that
 * need to invalidate the "extra" readahead pages.
 */
static void
swap_pager_ridpages(m, count, reqpage)
	vm_page_t *m;
	int count;
	int reqpage;
{
	int i;

	for (i = 0; i < count; i++) {
		if (i != reqpage) {
			vm_page_free(m[i]);
		}
	}
}

/*
 * swap_pager_iodone1 is the completion routine for both reads and
 * async writes
 */
static void
swap_pager_iodone1(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	bp->b_flags &= ~B_ASYNC;
	wakeup(bp);
}

static int
swap_pager_getpages(object, m, count, reqpage)
	vm_object_t object;
	vm_page_t *m;
	int count, reqpage;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i;
	boolean_t rv;
	vm_offset_t kva, off[count];
	vm_pindex_t paging_offset;
	int reqaddr[count];
	int sequential;

	int first, last;
	int failed;
	int reqdskregion;

	object = m[reqpage]->object;
	paging_offset = OFF_TO_IDX(object->paging_offset);
	sequential = (m[reqpage]->pindex == (object->last_read + 1));

	for (i = 0; i < count; i++) {
		vm_pindex_t fidx = m[i]->pindex + paging_offset;
		int ix = swap_pager_block_index(fidx);

		if (ix >= object->un_pager.swp.swp_nblocks) {
			int j;

			if (i <= reqpage) {
				swap_pager_ridpages(m, count, reqpage);
				return (VM_PAGER_FAIL);
			}
			for (j = i; j < count; j++) {
				vm_page_free(m[j]);
			}
			count = i;
			break;
		}
		swb[i] = &object->un_pager.swp.swp_blocks[ix];
		off[i] = swap_pager_block_offset(fidx);
		reqaddr[i] = swb[i]->swb_block[off[i]];
	}

	/* make sure the block backing the required page exists and is valid */

	if (reqaddr[reqpage] == SWB_EMPTY ||
	    (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
		swap_pager_ridpages(m, count, reqpage);
		return (VM_PAGER_FAIL);
	}
	reqdskregion = reqaddr[reqpage] / dmmax;
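
	/*
	 * Readahead pages are kept only while they stay inside the same
	 * dmmax-sized disk region as the required page; the scans below
	 * drop any page whose block would make the transfer cross such a
	 * region boundary or break physical contiguity.
	 */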

	/*
	 * search backwards for the first contiguous page to transfer
	 */
	failed = 0;
	first = 0;
	for (i = reqpage - 1; i >= 0; --i) {
		if (sequential || failed || (reqaddr[i] == SWB_EMPTY) ||
		    (swb[i]->swb_valid & (1 << off[i])) == 0 ||
		    (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
		    ((reqaddr[i] / dmmax) != reqdskregion)) {
			failed = 1;
			vm_page_free(m[i]);
			if (first == 0)
				first = i + 1;
		}
	}
	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	last = count;
	for (i = reqpage + 1; i < count; i++) {
		if (failed || (reqaddr[i] == SWB_EMPTY) ||
		    (swb[i]->swb_valid & (1 << off[i])) == 0 ||
		    (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
		    ((reqaddr[i] / dmmax) != reqdskregion)) {
			failed = 1;
			vm_page_free(m[i]);
			if (last == count)
				last = i;
		}
	}

	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i - first] = m[i];
			reqaddr[i - first] = reqaddr[i];
			off[i - first] = off[i];
		}
		count -= first;
		reqpage -= first;
	}
	++swb[reqpage]->swb_locked;

	/*
	 * at this point:
	 *	"m" is a pointer to the array of vm_page_t for paging I/O
	 *	"count" is the number of vm_page_t entries represented by "m"
	 *	"object" is the vm_object_t for I/O
	 *	"reqpage" is the index into "m" for the page actually faulted
	 */

	/*
	 * Get a swap buffer header to perform the IO
	 */
	bp = getpbuf();
	kva = (vm_offset_t) bp->b_data;

	/*
	 * map our page(s) into kva for input
	 */
	pmap_qenter(kva, m, count);

	bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING;
	bp->b_iodone = swap_pager_iodone1;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_data = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bp->b_bcount = PAGE_SIZE * count;
	bp->b_bufsize = PAGE_SIZE * count;

	pbgetvp(swapdev_vp, bp);

	cnt.v_swapin++;
	cnt.v_swappgsin += count;
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp->b_vp, bp);

	/*
	 * wait for the sync I/O to complete
	 */
	s = splvm();
	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep(bp, PVM, "swread", hz*20)) {
			printf(
"swap_pager: indefinite wait buffer: device: %#lx, blkno: %ld, size: %ld\n",
			    (u_long)bp->b_dev, (long)bp->b_blkno,
			    (long)bp->b_bcount);
		}
	}

	if (bp->b_flags & B_ERROR) {
		printf(
"swap_pager: I/O error - pagein failed; blkno %ld, size %ld, error %d\n",
		    (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error);
		rv = VM_PAGER_ERROR;
	} else {
		rv = VM_PAGER_OK;
	}

	splx(s);
	swb[reqpage]->swb_locked--;

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove(kva, count);

	/*
	 * release the physical I/O buffer
	 */
	relpbuf(bp);
	/*
	 * finish up input if everything is ok
	 */
	if (rv == VM_PAGER_OK) {
		for (i = 0; i < count; i++) {
			m[i]->dirty = 0;
			vm_page_flag_clear(m[i], PG_ZERO);
			if (i != reqpage) {
				/*
				 * whether or not to leave the page
				 * activated is up in the air, but we
				 * should put the page on a page queue
				 * somewhere. (it already is in the
				 * object). After some empirical
				 * results, it is best to deactivate
				 * the readahead pages.
				 */
				vm_page_deactivate(m[i]);

				/*
				 * just in case someone was asking for
				 * this page we now tell them that it
				 * is ok to use
				 */
				m[i]->valid = VM_PAGE_BITS_ALL;
				vm_page_wakeup(m[i]);
			}
		}

		m[reqpage]->object->last_read = m[count-1]->pindex;
	} else {
		swap_pager_ridpages(m, count, reqpage);
	}
	return (rv);
}

int
swap_pager_putpages(object, m, count, sync, rtvals)
	vm_object_t object;
	vm_page_t *m;
	int count;
	boolean_t sync;
	int *rtvals;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i, j, ix, firstidx, lastidx;
	boolean_t rv;
	vm_offset_t kva, off, fidx;
	swp_clean_t spc;
	vm_pindex_t paging_pindex;
	int reqaddr[count];
	int failed;

	if (vm_swap_size)
		no_swap_space = 0;

	if (no_swap_space) {
		for (i = 0; i < count; i++)
			rtvals[i] = VM_PAGER_FAIL;
		return VM_PAGER_FAIL;
	}

	if (curproc != pageproc)
		sync = TRUE;

	object = m[0]->object;
	paging_pindex = OFF_TO_IDX(object->paging_offset);

	failed = 0;
	for (j = 0; j < count; j++) {
		fidx = m[j]->pindex + paging_pindex;
		ix = swap_pager_block_index(fidx);
		swb[j] = 0;
		if (ix >= object->un_pager.swp.swp_nblocks) {
			rtvals[j] = VM_PAGER_FAIL;
			failed = 1;
			continue;
		} else {
			rtvals[j] = VM_PAGER_OK;
		}
		swb[j] = &object->un_pager.swp.swp_blocks[ix];
		swb[j]->swb_locked++;
		if (failed) {
			rtvals[j] = VM_PAGER_FAIL;
			continue;
		}
		off = swap_pager_block_offset(fidx);
		reqaddr[j] = swb[j]->swb_block[off];
		if (reqaddr[j] == SWB_EMPTY) {
			daddr_t blk;
			int tries;
			int ntoget;

			tries = 0;
			s = splvm();

			/*
			 * if any other pages have been allocated in this
			 * block, we only try to get one page.
			 */
			for (i = 0; i < SWB_NPAGES; i++) {
				if (swb[j]->swb_block[i] != SWB_EMPTY)
					break;
			}

			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
			/*
			 * this code is a little conservative, but works (the
			 * intent of this code is to allocate small chunks for
			 * small objects)
			 */
			if ((off == 0) && ((fidx + ntoget) > object->size)) {
				ntoget = object->size - fidx;
			}
	retrygetspace:
			if (!swap_pager_full && ntoget > 1 &&
			    swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE),
				&blk)) {

				for (i = 0; i < ntoget; i++) {
					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
					swb[j]->swb_valid = 0;
				}

				reqaddr[j] = swb[j]->swb_block[off];
			} else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE),
				&swb[j]->swb_block[off])) {
				/*
				 * if the allocation has failed, we try to
				 * reclaim space and retry.
				 */
				if (++tries == 1) {
					swap_pager_reclaim();
					goto retrygetspace;
				}
				rtvals[j] = VM_PAGER_AGAIN;
				failed = 1;
				swap_pager_full = 1;
			} else {
				reqaddr[j] = swb[j]->swb_block[off];
				swb[j]->swb_valid &= ~(1 << off);
			}
			splx(s);
		}
	}

	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	for (i = 0; i < count; i++) {
		if (failed ||
		    (reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) ||
		    ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) ||
		    (rtvals[i] != VM_PAGER_OK)) {
			failed = 1;
			if (rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
	}

	ix = 0;
	firstidx = -1;
	for (i = 0; i < count; i++) {
		if (rtvals[i] == VM_PAGER_OK) {
			ix++;
			if (firstidx == -1) {
				firstidx = i;
			}
		} else if (firstidx >= 0) {
			break;
		}
	}

	if (firstidx == -1) {
		for (i = 0; i < count; i++) {
			if (rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
		return VM_PAGER_AGAIN;
	}

	lastidx = firstidx + ix;

	if (ix > max_pageout_cluster) {
		for (i = firstidx + max_pageout_cluster; i < lastidx; i++) {
			if (rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
		ix = max_pageout_cluster;
		lastidx = firstidx + ix;
	}

	for (i = 0; i < firstidx; i++) {
		if (swb[i])
			swb[i]->swb_locked--;
	}

	for (i = lastidx; i < count; i++) {
		if (swb[i])
			swb[i]->swb_locked--;
	}

#if defined(INVARIANTS)
	for (i = firstidx; i < lastidx; i++) {
		if (reqaddr[i] == SWB_EMPTY) {
			printf("I/O to empty block???? -- pindex: %d, i: %d\n",
				m[i]->pindex, i);
		}
	}
#endif

	/*
	 * Clean up all completed async pageouts.
	 */
	if (swap_pager_free_pending)
		swap_pager_sync();

	/*
	 * get a swap pager clean data structure, block until we get it
	 */
	if (curproc == pageproc) {
		if (swap_pager_free_count == 0) {
			s = splvm();
			while (swap_pager_free_count == 0) {
				swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT;
				/*
				 * if we don't get one within a short time, then
				 * there is a potential deadlock, so we go on
				 * trying to free pages.  It is important to block
				 * here as opposed to returning, thereby allowing
				 * the pageout daemon to continue.  It is likely
				 * that the pageout daemon will start suboptimally
				 * reclaiming vnode backed pages if we don't block.
				 * Since the I/O subsystem is probably already
				 * fully utilized, might as well wait.
				 */
				if (tsleep(&swap_pager_needflags, PVM-1, "swpfre", hz/2)) {
					if (swap_pager_free_pending)
						swap_pager_sync();
					if (swap_pager_free_count == 0) {
						for (i = firstidx; i < lastidx; i++) {
							rtvals[i] = VM_PAGER_AGAIN;
						}
						splx(s);
						return VM_PAGER_AGAIN;
					}
				} else {
					swap_pager_sync();
				}
			}
			splx(s);
		}

		spc = TAILQ_FIRST(&swap_pager_free);
		KASSERT(spc,
		    ("swap_pager_putpages: free queue is empty, %d expected\n",
		    swap_pager_free_count));
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
		swap_pager_free_count--;

		kva = spc->spc_kva;
		bp = spc->spc_bp;
		bzero(bp, sizeof *bp);
		bp->b_spc = spc;
		bp->b_xflags = 0;
		bp->b_data = (caddr_t) kva;
	} else {
		spc = NULL;
		bp = getpbuf();
		kva = (vm_offset_t) bp->b_data;
		bp->b_spc = NULL;
	}

	/*
	 * map our page(s) into kva for I/O
	 */
	pmap_qenter(kva, &m[firstidx], ix);

	/*
	 * get the base I/O offset into the swap file
	 */
	for (i = firstidx; i < lastidx; i++) {
		fidx = m[i]->pindex + paging_pindex;
		off = swap_pager_block_offset(fidx);
		/*
		 * set the valid bit
		 */
		swb[i]->swb_valid |= (1 << off);
		/*
		 * and unlock the data structure
		 */
		swb[i]->swb_locked--;
	}

	bp->b_flags = B_BUSY | B_PAGING;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_blkno = reqaddr[firstidx];
	pbgetvp(swapdev_vp, bp);

	bp->b_bcount = PAGE_SIZE * ix;
	bp->b_bufsize = PAGE_SIZE * ix;

	s = splvm();
	swapdev_vp->v_numoutput++;

	/*
	 * If this is an async write we set up additional buffer fields and
	 * place a "cleaning" entry on the inuse queue.
	 */
	object->un_pager.swp.swp_poip++;

	if (spc) {
		spc->spc_flags = 0;
		spc->spc_object = object;
		bp->b_npages = ix;
		for (i = firstidx; i < lastidx; i++) {
			spc->spc_m[i] = m[i];
			bp->b_pages[i - firstidx] = m[i];
			vm_page_protect(m[i], VM_PROT_READ);
			pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
			m[i]->dirty = 0;
		}
		spc->spc_first = firstidx;
		spc->spc_count = ix;
		/*
		 * the completion routine for async writes
		 */
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
	} else {
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone1;
		bp->b_npages = ix;
		for (i = firstidx; i < lastidx; i++)
			bp->b_pages[i - firstidx] = m[i];
	}

	cnt.v_swapout++;
	cnt.v_swappgsout += ix;

	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp->b_vp, bp);
	if (sync == FALSE) {
		if (swap_pager_free_pending) {
			swap_pager_sync();
		}
		for (i = firstidx; i < lastidx; i++) {
			rtvals[i] = VM_PAGER_PEND;
		}
		splx(s);
		return VM_PAGER_PEND;
	}

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep(bp, PVM, "swwrt", 0);
	}

	if (bp->b_flags & B_ERROR) {
		printf(
"swap_pager: I/O error - pageout failed; blkno %ld, size %ld, error %d\n",
		    (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error);
		rv = VM_PAGER_ERROR;
	} else {
		rv = VM_PAGER_OK;
	}

	object->un_pager.swp.swp_poip--;
	if (object->un_pager.swp.swp_poip == 0)
		wakeup(object);

	if (bp->b_vp)
		pbrelvp(bp);

	splx(s);

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove(kva, ix);

	/*
	 * if we have written the page, then indicate that the page is clean.
	 */
	if (rv == VM_PAGER_OK) {
		for (i = firstidx; i < lastidx; i++) {
			if (rtvals[i] == VM_PAGER_OK) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->dirty = 0;
				/*
				 * optimization, if a page has been read
				 * during the pageout process, we activate it.
				 */
				if (((m[i]->flags & (PG_WANTED|PG_REFERENCED)) ||
				    pmap_ts_referenced(VM_PAGE_TO_PHYS(m[i])))) {
					vm_page_activate(m[i]);
				}
			}
		}
	} else {
		for (i = firstidx; i < lastidx; i++) {
			rtvals[i] = rv;
		}
	}

	if (spc != NULL) {
		if (bp->b_rcred != NOCRED)
			crfree(bp->b_rcred);
		if (bp->b_wcred != NOCRED)
			crfree(bp->b_wcred);
		spc_free(spc);
	} else
		relpbuf(bp);
	if (swap_pager_free_pending)
		swap_pager_sync();

	return (rv);
}
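
/*
 * swap_pager_sync drains the done queue: each spc that swap_pager_iodone()
 * has moved onto swap_pager_done is handed to swap_pager_finish(), which
 * unmaps its KVA, updates page state and returns the spc to the free list.
 */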

void
swap_pager_sync()
{
	swp_clean_t spc;

	while ((spc = TAILQ_FIRST(&swap_pager_done)) != NULL) {
		swap_pager_finish(spc);
	}
	return;
}

static void
swap_pager_finish(spc)
	register swp_clean_t spc;
{
	int i, s, lastidx;
	vm_object_t object;
	vm_page_t *ma;

	ma = spc->spc_m;
	object = spc->spc_object;
	lastidx = spc->spc_first + spc->spc_count;

	s = splvm();
	TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
	splx(s);

	pmap_qremove(spc->spc_kva, spc->spc_count);

	/*
	 * If no error, mark as clean and inform the pmap system. If error,
	 * mark as dirty so we will try again. (XXX could get stuck doing
	 * this, should give up after awhile)
	 */
	if (spc->spc_flags & SPC_ERROR) {

		for (i = spc->spc_first; i < lastidx; i++) {
			printf("swap_pager_finish: I/O error, clean of page %lx failed\n",
			    (u_long) VM_PAGE_TO_PHYS(ma[i]));
			ma[i]->dirty = VM_PAGE_BITS_ALL;
			vm_page_io_finish(ma[i]);
		}

		vm_object_pip_subtract(object, spc->spc_count);
		if ((object->paging_in_progress == 0) &&
			(object->flags & OBJ_PIPWNT)) {
			vm_object_clear_flag(object, OBJ_PIPWNT);
			wakeup(object);
		}

	} else {
		for (i = spc->spc_first; i < lastidx; i++) {
			if ((ma[i]->queue != PQ_ACTIVE) &&
			   ((ma[i]->flags & PG_WANTED) ||
				 pmap_ts_referenced(VM_PAGE_TO_PHYS(ma[i])))) {
				vm_page_activate(ma[i]);
			}
		}
	}

	nswiodone -= spc->spc_count;
	swap_pager_free_pending--;
	spc_free(spc);

	return;
}

/*
 * swap_pager_iodone
 */
static void
swap_pager_iodone(bp)
	register struct buf *bp;
{
	int i, s, lastidx;
	register swp_clean_t spc;
	vm_object_t object;
	vm_page_t *ma;

	s = splvm();
	spc = (swp_clean_t) bp->b_spc;
	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);

	object = spc->spc_object;

#if defined(DIAGNOSTIC)
	if (object->paging_in_progress < spc->spc_count)
		printf("swap_pager_iodone: paging_in_progress(%d) < spc_count(%d)\n",
			object->paging_in_progress, spc->spc_count);
#endif

	if (bp->b_flags & B_ERROR) {
		spc->spc_flags |= SPC_ERROR;
		printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n",
		    (bp->b_flags & B_READ) ? "pagein" : "pageout",
		    (u_long) bp->b_blkno, bp->b_bcount, bp->b_error);
	} else {
		vm_object_pip_subtract(object, spc->spc_count);
		if ((object->paging_in_progress == 0) &&
			(object->flags & OBJ_PIPWNT)) {
			vm_object_clear_flag(object, OBJ_PIPWNT);
			wakeup(object);
		}
		ma = spc->spc_m;
		lastidx = spc->spc_first + spc->spc_count;
		for (i = spc->spc_first; i < lastidx; i++) {
			/*
			 * we wakeup any processes that are waiting on these pages.
			 */
			vm_page_io_finish(ma[i]);
		}
	}

	if (bp->b_vp)
		pbrelvp(bp);

	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);

	nswiodone += spc->spc_count;
	swap_pager_free_pending++;
	if (--spc->spc_object->un_pager.swp.swp_poip == 0) {
		wakeup(spc->spc_object);
	}

	if (swap_pager_needflags &&
	  ((swap_pager_free_count + swap_pager_free_pending) > (npendingio / 2))) {
		spc_wakeup();
	}

	if ((TAILQ_FIRST(&swap_pager_inuse) == NULL) &&
		vm_pageout_pages_needed) {
		wakeup(&vm_pageout_pages_needed);
		vm_pageout_pages_needed = 0;
	}

	splx(s);
}
1670