xref: /freebsd/sys/vm/swap_pager.c (revision df8bae1de4b67ccf57f4afebd4e2bf258c38910d)
1 /*
2  * Copyright (c) 1990 University of Utah.
3  * Copyright (c) 1991, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the Systems Programming Group of the University of Utah Computer
8  * Science Department.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
39  *
40  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
41  */
42 
43 /*
44  * Quick hack to page to dedicated partition(s).
45  * TODO:
46  *	Add multiprocessor locks
47  *	Deal with async writes in a better fashion
48  */
49 
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/proc.h>
53 #include <sys/buf.h>
54 #include <sys/map.h>
55 #include <sys/vnode.h>
56 #include <sys/malloc.h>
57 
58 #include <miscfs/specfs/specdev.h>
59 
60 #include <vm/vm.h>
61 #include <vm/vm_page.h>
62 #include <vm/vm_pageout.h>
63 #include <vm/swap_pager.h>
64 
#define NSWSIZES	16	/* size of swtab */
#define MAXDADDRS	64	/* max # of disk addrs for fixed allocations */
#ifndef NPENDINGIO
#define NPENDINGIO	64	/* max # of pending cleans */
#endif

#ifdef DEBUG
int	swpagerdebug = 0x100;		/* debug bitmask; default 0x100 == SDB_ANOM */
#define	SDB_FOLLOW	0x001	/* trace routine entry */
#define SDB_INIT	0x002	/* trace initialization */
#define SDB_ALLOC	0x004	/* trace pager alloc/dealloc */
#define SDB_IO		0x008	/* trace I/O start and completion */
#define SDB_WRITE	0x010	/* trace pageout (write) bookkeeping */
#define SDB_FAIL	0x020	/* report failure paths */
#define SDB_ALLOCBLK	0x040	/* trace swap block allocation/lookup */
#define SDB_FULL	0x080	/* verbose block alloc/free reporting */
#define SDB_ANOM	0x100	/* report anomalous conditions */
#define SDB_ANOMPANIC	0x200	/* stay quiet once panicstr is set */
#define SDB_CLUSTER	0x400	/* trace page clustering */
#define SDB_PARANOIA	0x800	/* run swap_pager_clean_check on each I/O */
#endif
86 
TAILQ_HEAD(swpclean, swpagerclean);

/*
 * Bookkeeping for one in-progress asynchronous pageout.  A fixed pool
 * of these (swcleanlist) is threaded onto either swap_pager_free or
 * swap_pager_inuse.
 */
struct swpagerclean {
	TAILQ_ENTRY(swpagerclean)	spc_list;	/* free/inuse list linkage */
	int				spc_flags;	/* SPC_* flags below */
	struct buf			*spc_bp;	/* buf driving the I/O (NULLed at iodone) */
	sw_pager_t			spc_swp;	/* pager this pageout belongs to */
	vm_offset_t			spc_kva;	/* KVA the cluster is mapped at */
	vm_page_t			spc_m;		/* first page of the cluster */
	int				spc_npages;	/* pages in the cluster */
} swcleanlist[NPENDINGIO];
typedef struct swpagerclean *swp_clean_t;
99 
/* spc_flags values */
#define SPC_FREE	0x00	/* entry is on the free list */
#define SPC_BUSY	0x01	/* I/O submitted, not yet complete */
#define SPC_DONE	0x02	/* I/O complete (set by swap_pager_iodone) */
#define SPC_ERROR	0x04	/* I/O finished with B_ERROR */
105 
/*
 * Table mapping object-size ranges to the swap block allocation size
 * used for objects in that range.  Built by swap_pager_init; searched
 * by swap_pager_alloc.  The terminal entry has st_osize == 0.
 */
struct swtab {
	vm_size_t st_osize;	/* size of object (bytes) */
	int	  st_bsize;	/* vs. size of swap block (DEV_BSIZE units) */
#ifdef DEBUG
	u_long	  st_inuse;	/* number in this range in use */
	u_long	  st_usecnt;	/* total used of this size */
#endif
} swtab[NSWSIZES+1];
114 
#ifdef DEBUG
int		swap_pager_poip;	/* pageouts in progress */
int		swap_pager_piip;	/* pageins in progress */
#endif

int		swap_pager_maxcluster;	/* maximum cluster size (bytes) */
int		swap_pager_npendingio;	/* number of pager clean structs */

struct swpclean	swap_pager_inuse;	/* list of pending page cleans */
struct swpclean	swap_pager_free;	/* list of free pager clean structs */
struct pagerlst	swap_pager_list;	/* list of "named" anon regions */
126 
/*
 * Pager interface implementations (installed in swappagerops below)
 * and internal helpers.
 */
static void 		swap_pager_init __P((void));
static vm_pager_t	swap_pager_alloc
			    __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t));
static void		swap_pager_clean __P((int));
#ifdef DEBUG
static void		swap_pager_clean_check __P((vm_page_t *, int, int));
#endif
static void		swap_pager_cluster
			    __P((vm_pager_t, vm_offset_t,
				 vm_offset_t *, vm_offset_t *));
static void		swap_pager_dealloc __P((vm_pager_t));
static int		swap_pager_getpage
			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
static boolean_t	swap_pager_haspage __P((vm_pager_t, vm_offset_t));
static int		swap_pager_io __P((sw_pager_t, vm_page_t *, int, int));
static void		swap_pager_iodone __P((struct buf *));
static int		swap_pager_putpage
			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
145 
/*
 * Ops vector exported to the VM system; also installed as
 * dfltpagerops by swap_pager_init.
 */
struct pagerops swappagerops = {
	swap_pager_init,
	swap_pager_alloc,
	swap_pager_dealloc,
	swap_pager_getpage,
	swap_pager_putpage,
	swap_pager_haspage,
	swap_pager_cluster
};
155 
/*
 * One-time pager initialization: install the swap pager as the default
 * pager, set up the async cleaning lists, and build the object-size ->
 * swap-block-size table (swtab).
 */
static void
swap_pager_init()
{
	register swp_clean_t spc;
	register int i, bsize;
	extern int dmmin, dmmax;	/* swap allocation constants */
	int maxbsize;

#ifdef DEBUG
	if (swpagerdebug & (SDB_FOLLOW|SDB_INIT))
		printf("swpg_init()\n");
#endif
	dfltpagerops = &swappagerops;
	TAILQ_INIT(&swap_pager_list);

	/*
	 * Allocate async IO structures.
	 *
	 * XXX it would be nice if we could do this dynamically based on
	 * the value of nswbuf (since we are ultimately limited by that)
	 * but neither nswbuf nor malloc has been initialized yet.  So the
	 * structs are statically allocated above.
	 */
	swap_pager_npendingio = NPENDINGIO;

	/*
	 * Initialize clean lists: all entries start free.
	 */
	TAILQ_INIT(&swap_pager_inuse);
	TAILQ_INIT(&swap_pager_free);
	for (i = 0, spc = swcleanlist; i < swap_pager_npendingio; i++, spc++) {
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		spc->spc_flags = SPC_FREE;
	}

	/*
	 * Calculate the swap allocation constants.
	 * dmmin must be at least one cluster (CLBYTES) of disk blocks.
	 */
        if (dmmin == 0) {
                dmmin = DMMIN;
		if (dmmin < CLBYTES/DEV_BSIZE)
			dmmin = CLBYTES/DEV_BSIZE;
	}
        if (dmmax == 0)
                dmmax = DMMAX;

	/*
	 * Fill in our table of object size vs. allocation size.
	 * Block sizes double per entry, from at least one page up to
	 * maxbsize (capped so one sw_bm_t bitmap can cover the block's
	 * pages, and by dmmax).
	 */
	bsize = btodb(PAGE_SIZE);
	if (bsize < dmmin)
		bsize = dmmin;
	maxbsize = btodb(sizeof(sw_bm_t) * NBBY * PAGE_SIZE);
	if (maxbsize > dmmax)
		maxbsize = dmmax;
	for (i = 0; i < NSWSIZES; i++) {
		swtab[i].st_osize = (vm_size_t) (MAXDADDRS * dbtob(bsize));
		swtab[i].st_bsize = bsize;
		/* track largest single-I/O cluster size (bytes) */
		if (bsize <= btodb(MAXPHYS))
			swap_pager_maxcluster = dbtob(bsize);
#ifdef DEBUG
		if (swpagerdebug & SDB_INIT)
			printf("swpg_init: ix %d, size %x, bsize %x\n",
			       i, swtab[i].st_osize, swtab[i].st_bsize);
#endif
		if (bsize >= maxbsize)
			break;
		bsize *= 2;
	}
	/* terminal entry: osize 0 catches all larger objects */
	swtab[i].st_osize = 0;
	swtab[i].st_bsize = bsize;
}
228 
229 /*
230  * Allocate a pager structure and associated resources.
231  * Note that if we are called from the pageout daemon (handle == NULL)
232  * we should not wait for memory as it could resulting in deadlock.
233  */
234 static vm_pager_t
235 swap_pager_alloc(handle, size, prot, foff)
236 	caddr_t handle;
237 	register vm_size_t size;
238 	vm_prot_t prot;
239 	vm_offset_t foff;
240 {
241 	register vm_pager_t pager;
242 	register sw_pager_t swp;
243 	struct swtab *swt;
244 	int waitok;
245 
246 #ifdef DEBUG
247 	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC))
248 		printf("swpg_alloc(%x, %x, %x)\n", handle, size, prot);
249 #endif
250 	/*
251 	 * If this is a "named" anonymous region, look it up and
252 	 * return the appropriate pager if it exists.
253 	 */
254 	if (handle) {
255 		pager = vm_pager_lookup(&swap_pager_list, handle);
256 		if (pager != NULL) {
257 			/*
258 			 * Use vm_object_lookup to gain a reference
259 			 * to the object and also to remove from the
260 			 * object cache.
261 			 */
262 			if (vm_object_lookup(pager) == NULL)
263 				panic("swap_pager_alloc: bad object");
264 			return(pager);
265 		}
266 	}
267 	/*
268 	 * Pager doesn't exist, allocate swap management resources
269 	 * and initialize.
270 	 */
271 	waitok = handle ? M_WAITOK : M_NOWAIT;
272 	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
273 	if (pager == NULL)
274 		return(NULL);
275 	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
276 	if (swp == NULL) {
277 #ifdef DEBUG
278 		if (swpagerdebug & SDB_FAIL)
279 			printf("swpg_alloc: swpager malloc failed\n");
280 #endif
281 		free((caddr_t)pager, M_VMPAGER);
282 		return(NULL);
283 	}
284 	size = round_page(size);
285 	for (swt = swtab; swt->st_osize; swt++)
286 		if (size <= swt->st_osize)
287 			break;
288 #ifdef DEBUG
289 	swt->st_inuse++;
290 	swt->st_usecnt++;
291 #endif
292 	swp->sw_osize = size;
293 	swp->sw_bsize = swt->st_bsize;
294 	swp->sw_nblocks = (btodb(size) + swp->sw_bsize - 1) / swp->sw_bsize;
295 	swp->sw_blocks = (sw_blk_t)
296 		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
297 		       M_VMPGDATA, M_NOWAIT);
298 	if (swp->sw_blocks == NULL) {
299 		free((caddr_t)swp, M_VMPGDATA);
300 		free((caddr_t)pager, M_VMPAGER);
301 #ifdef DEBUG
302 		if (swpagerdebug & SDB_FAIL)
303 			printf("swpg_alloc: sw_blocks malloc failed\n");
304 		swt->st_inuse--;
305 		swt->st_usecnt--;
306 #endif
307 		return(FALSE);
308 	}
309 	bzero((caddr_t)swp->sw_blocks,
310 	      swp->sw_nblocks * sizeof(*swp->sw_blocks));
311 	swp->sw_poip = 0;
312 	if (handle) {
313 		vm_object_t object;
314 
315 		swp->sw_flags = SW_NAMED;
316 		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
317 		/*
318 		 * Consistant with other pagers: return with object
319 		 * referenced.  Can't do this with handle == NULL
320 		 * since it might be the pageout daemon calling.
321 		 */
322 		object = vm_object_allocate(size);
323 		vm_object_enter(object, pager);
324 		vm_object_setpager(object, pager, 0, FALSE);
325 	} else {
326 		swp->sw_flags = 0;
327 		pager->pg_list.tqe_next = NULL;
328 		pager->pg_list.tqe_prev = NULL;
329 	}
330 	pager->pg_handle = handle;
331 	pager->pg_ops = &swappagerops;
332 	pager->pg_type = PG_SWAP;
333 	pager->pg_flags = PG_CLUSTERPUT;
334 	pager->pg_data = swp;
335 
336 #ifdef DEBUG
337 	if (swpagerdebug & SDB_ALLOC)
338 		printf("swpg_alloc: pg_data %x, %x of %x at %x\n",
339 		       swp, swp->sw_nblocks, swp->sw_bsize, swp->sw_blocks);
340 #endif
341 	return(pager);
342 }
343 
/*
 * Release a swap pager: unlink it from the named-pager list, wait for
 * all of its pageouts to complete, return its swap blocks to the swap
 * map, and free all management structures.
 */
static void
swap_pager_dealloc(pager)
	vm_pager_t pager;
{
	register int i;
	register sw_blk_t bp;
	register sw_pager_t swp;
	struct swtab *swt;
	int s;

#ifdef DEBUG
	/* save panic time state */
	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
		return;
	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC))
		printf("swpg_dealloc(%x)\n", pager);
#endif
	/*
	 * Remove from list right away so lookups will fail if we
	 * block for pageout completion.
	 */
	swp = (sw_pager_t) pager->pg_data;
	if (swp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
		swp->sw_flags &= ~SW_NAMED;
	}
#ifdef DEBUG
	/* balance the st_inuse++ done in swap_pager_alloc */
	for (swt = swtab; swt->st_osize; swt++)
		if (swp->sw_osize <= swt->st_osize)
			break;
	swt->st_inuse--;
#endif

	/*
	 * Wait for all pageouts to finish and remove
	 * all entries from cleaning list.  swap_pager_iodone wakes
	 * us via SW_WANTED as sw_poip drains; splbio keeps the
	 * check-then-sleep atomic with respect to I/O completion.
	 */
	s = splbio();
	while (swp->sw_poip) {
		swp->sw_flags |= SW_WANTED;
		(void) tsleep(swp, PVM, "swpgdealloc", 0);
	}
	splx(s);
	swap_pager_clean(B_WRITE);

	/*
	 * Free left over swap blocks back to the swap resource map.
	 */
	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++)
		if (bp->swb_block) {
#ifdef DEBUG
			if (swpagerdebug & (SDB_ALLOCBLK|SDB_FULL))
				printf("swpg_dealloc: blk %x\n",
				       bp->swb_block);
#endif
			rmfree(swapmap, swp->sw_bsize, bp->swb_block);
		}
	/*
	 * Free swap management resources
	 */
	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
	free((caddr_t)swp, M_VMPGDATA);
	free((caddr_t)pager, M_VMPAGER);
}
408 
409 static int
410 swap_pager_getpage(pager, mlist, npages, sync)
411 	vm_pager_t pager;
412 	vm_page_t *mlist;
413 	int npages;
414 	boolean_t sync;
415 {
416 #ifdef DEBUG
417 	if (swpagerdebug & SDB_FOLLOW)
418 		printf("swpg_getpage(%x, %x, %x, %x)\n",
419 		       pager, mlist, npages, sync);
420 #endif
421 	return(swap_pager_io((sw_pager_t)pager->pg_data,
422 			     mlist, npages, B_READ));
423 }
424 
425 static int
426 swap_pager_putpage(pager, mlist, npages, sync)
427 	vm_pager_t pager;
428 	vm_page_t *mlist;
429 	int npages;
430 	boolean_t sync;
431 {
432 	int flags;
433 
434 #ifdef DEBUG
435 	if (swpagerdebug & SDB_FOLLOW)
436 		printf("swpg_putpage(%x, %x, %x, %x)\n",
437 		       pager, mlist, npages, sync);
438 #endif
439 	if (pager == NULL) {
440 		swap_pager_clean(B_WRITE);
441 		return (VM_PAGER_OK);		/* ??? */
442 	}
443 	flags = B_WRITE;
444 	if (!sync)
445 		flags |= B_ASYNC;
446 	return(swap_pager_io((sw_pager_t)pager->pg_data,
447 			     mlist, npages, flags));
448 }
449 
/*
 * Return TRUE iff the page at the given object offset has been written
 * to swap: its swap block is allocated and the page's bit is set in
 * the block's written mask.
 */
static boolean_t
swap_pager_haspage(pager, offset)
	vm_pager_t pager;
	vm_offset_t offset;
{
	register sw_pager_t swp;
	register sw_blk_t swb;
	int ix;

#ifdef DEBUG
	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK))
		printf("swpg_haspage(%x, %x) ", pager, offset);
#endif
	swp = (sw_pager_t) pager->pg_data;
	/* ix is first the swap block index... */
	ix = offset / dbtob(swp->sw_bsize);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
#ifdef DEBUG
		if (swpagerdebug & (SDB_FAIL|SDB_FOLLOW|SDB_ALLOCBLK))
			printf("swpg_haspage: %x bad offset %x, ix %x\n",
			       swp->sw_blocks, offset, ix);
#endif
		return(FALSE);
	}
	swb = &swp->sw_blocks[ix];
	/*
	 * ...then reused as the page index within the block.  It is only
	 * recomputed when the block is allocated; if not, the tests below
	 * short-circuit on swb_block == 0 and never consult the stale ix.
	 */
	if (swb->swb_block)
		ix = atop(offset % dbtob(swp->sw_bsize));
#ifdef DEBUG
	if (swpagerdebug & SDB_ALLOCBLK)
		printf("%x blk %x+%x ", swp->sw_blocks, swb->swb_block, ix);
	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK))
		printf("-> %c\n",
		       "FT"[swb->swb_block && (swb->swb_mask & (1 << ix))]);
#endif
	if (swb->swb_block && (swb->swb_mask & (1 << ix)))
		return(TRUE);
	return(FALSE);
}
487 
488 static void
489 swap_pager_cluster(pager, offset, loffset, hoffset)
490 	vm_pager_t	pager;
491 	vm_offset_t	offset;
492 	vm_offset_t	*loffset;
493 	vm_offset_t	*hoffset;
494 {
495 	sw_pager_t swp;
496 	register int bsize;
497 	vm_offset_t loff, hoff;
498 
499 #ifdef DEBUG
500 	if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER))
501 		printf("swpg_cluster(%x, %x) ", pager, offset);
502 #endif
503 	swp = (sw_pager_t) pager->pg_data;
504 	bsize = dbtob(swp->sw_bsize);
505 	if (bsize > swap_pager_maxcluster)
506 		bsize = swap_pager_maxcluster;
507 
508 	loff = offset - (offset % bsize);
509 	if (loff >= swp->sw_osize)
510 		panic("swap_pager_cluster: bad offset");
511 
512 	hoff = loff + bsize;
513 	if (hoff > swp->sw_osize)
514 		hoff = swp->sw_osize;
515 
516 	*loffset = loff;
517 	*hoffset = hoff;
518 #ifdef DEBUG
519 	if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER))
520 		printf("returns [%x-%x]\n", loff, hoff);
521 #endif
522 }
523 
/*
 * Scaled down version of swap().
 * Assumes that PAGE_SIZE < MAXPHYS; i.e. only one operation needed.
 * BOGUS:  lower level IO routines expect a KVA so we have to map our
 * provided physical page into the KVA to keep them happy.
 *
 * Core I/O routine: performs a pagein (B_READ) or pageout for the
 * given page cluster.  Returns a VM_PAGER_* status; VM_PAGER_PEND
 * means an async write was queued and will be finished by
 * swap_pager_iodone/swap_pager_clean.
 */
static int
swap_pager_io(swp, mlist, npages, flags)
	register sw_pager_t swp;
	vm_page_t *mlist;
	int npages;
	int flags;
{
	register struct buf *bp;
	register sw_blk_t swb;
	register int s;
	int ix, mask;
	boolean_t rv;
	vm_offset_t kva, off;
	swp_clean_t spc;
	vm_page_t m;

#ifdef DEBUG
	/* save panic time state */
	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
		return (VM_PAGER_FAIL);		/* XXX: correct return? */
	if (swpagerdebug & (SDB_FOLLOW|SDB_IO))
		printf("swpg_io(%x, %x, %x, %x)\n", swp, mlist, npages, flags);
	if (flags & B_READ) {
		if (flags & B_ASYNC)
			panic("swap_pager_io: cannot do ASYNC reads");
		if (npages != 1)
			panic("swap_pager_io: cannot do clustered reads");
	}
#endif

	/*
	 * First determine if the page exists in the pager if this is
	 * a sync read.  This quickly handles cases where we are
	 * following shadow chains looking for the top level object
	 * with the page.
	 */
	m = *mlist;
	/* byte offset of the first page within the pager's object */
	off = m->offset + m->object->paging_offset;
	ix = off / dbtob(swp->sw_bsize);	/* swap block index */
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
#ifdef DEBUG
		if ((flags & B_READ) == 0 && (swpagerdebug & SDB_ANOM)) {
			printf("swap_pager_io: no swap block on write\n");
			return(VM_PAGER_BAD);
		}
#endif
		return(VM_PAGER_FAIL);
	}
	swb = &swp->sw_blocks[ix];
	/* from here on, off is the byte offset within the swap block */
	off = off % dbtob(swp->sw_bsize);
	/*
	 * A read of a page whose block was never allocated, or whose
	 * "written" bit is clear, cannot succeed.
	 */
	if ((flags & B_READ) &&
	    (swb->swb_block == 0 || (swb->swb_mask & (1 << atop(off))) == 0))
		return(VM_PAGER_FAIL);

	/*
	 * For reads (pageins) and synchronous writes, we clean up
	 * all completed async pageouts.
	 */
	if ((flags & B_ASYNC) == 0) {
		s = splbio();
		swap_pager_clean(flags&B_READ);
#ifdef DEBUG
		if (swpagerdebug & SDB_PARANOIA)
			swap_pager_clean_check(mlist, npages, flags&B_READ);
#endif
		splx(s);
	}
	/*
	 * For async writes (pageouts), we cleanup completed pageouts so
	 * that all available resources are freed.  Also tells us if this
	 * page is already being cleaned.  If it is, or no resources
	 * are available, we try again later.
	 */
	else {
		swap_pager_clean(B_WRITE);
#ifdef DEBUG
		if (swpagerdebug & SDB_PARANOIA)
			swap_pager_clean_check(mlist, npages, B_WRITE);
#endif
		if (swap_pager_free.tqh_first == NULL) {
#ifdef DEBUG
			if (swpagerdebug & SDB_FAIL)
				printf("%s: no available io headers\n",
				       "swap_pager_io");
#endif
			return(VM_PAGER_AGAIN);
		}
	}

	/*
	 * Allocate a swap block if necessary.
	 */
	if (swb->swb_block == 0) {
		swb->swb_block = rmalloc(swapmap, swp->sw_bsize);
		if (swb->swb_block == 0) {
#ifdef DEBUG
			if (swpagerdebug & SDB_FAIL)
				printf("swpg_io: rmalloc of %x failed\n",
				       swp->sw_bsize);
#endif
			/*
			 * XXX this is technically a resource shortage that
			 * should return AGAIN, but the situation isn't likely
			 * to be remedied just by delaying a little while and
			 * trying again (the pageout daemon's current response
			 * to AGAIN) so we just return FAIL.
			 */
			return(VM_PAGER_FAIL);
		}
#ifdef DEBUG
		if (swpagerdebug & (SDB_FULL|SDB_ALLOCBLK))
			printf("swpg_io: %x alloc blk %x at ix %x\n",
			       swp->sw_blocks, swb->swb_block, ix);
#endif
	}

	/*
	 * Allocate a kernel virtual address and initialize so that PTE
	 * is available for lower level IO drivers.
	 */
	kva = vm_pager_map_pages(mlist, npages, !(flags & B_ASYNC));
	if (kva == NULL) {
#ifdef DEBUG
		if (swpagerdebug & SDB_FAIL)
			printf("%s: no KVA space to map pages\n",
			       "swap_pager_io");
#endif
		return(VM_PAGER_AGAIN);
	}

	/*
	 * Get a swap buffer header and initialize it.
	 * bswlist is the freelist of swap buf headers; sleep until one
	 * is available.
	 */
	s = splbio();
	while (bswlist.b_actf == NULL) {
#ifdef DEBUG
		if (swpagerdebug & SDB_ANOM)
			printf("swap_pager_io: wait on swbuf for %x (%d)\n",
			       m, flags);
#endif
		bswlist.b_flags |= B_WANTED;
		tsleep((caddr_t)&bswlist, PSWP+1, "swpgiobuf", 0);
	}
	bp = bswlist.b_actf;
	bswlist.b_actf = bp->b_actf;
	splx(s);
	bp->b_flags = B_BUSY | (flags & B_READ);
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = swb->swb_block + btodb(off);
	VHOLD(swapdev_vp);
	bp->b_vp = swapdev_vp;
	if (swapdev_vp->v_type == VBLK)
		bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bcount = npages * PAGE_SIZE;

	/*
	 * For writes we set up additional buffer fields, record a pageout
	 * in progress and mark that these swap blocks are now allocated.
	 */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages * PAGE_SIZE;
		swapdev_vp->v_numoutput++;
		s = splbio();
		swp->sw_poip++;
		splx(s);
		/* npages consecutive "written" bits starting at atop(off) */
		mask = (~(~0 << npages)) << atop(off);
#ifdef DEBUG
		swap_pager_poip++;
		if (swpagerdebug & SDB_WRITE)
			printf("swpg_io: write: bp=%x swp=%x poip=%d\n",
			       bp, swp, swp->sw_poip);
		if ((swpagerdebug & SDB_ALLOCBLK) &&
		    (swb->swb_mask & mask) != mask)
			printf("swpg_io: %x write %d pages at %x+%x\n",
			       swp->sw_blocks, npages, swb->swb_block,
			       atop(off));
		if (swpagerdebug & SDB_CLUSTER)
			printf("swpg_io: off=%x, npg=%x, mask=%x, bmask=%x\n",
			       off, npages, mask, swb->swb_mask);
#endif
		swb->swb_mask |= mask;
	}
	/*
	 * If this is an async write we set up still more buffer fields
	 * and place a "cleaning" entry on the inuse queue.
	 */
	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) {
#ifdef DEBUG
		if (swap_pager_free.tqh_first == NULL)
			panic("swpg_io: lost spc");
#endif
		spc = swap_pager_free.tqh_first;
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
#ifdef DEBUG
		if (spc->spc_flags != SPC_FREE)
			panic("swpg_io: bad free spc");
#endif
		spc->spc_flags = SPC_BUSY;
		spc->spc_bp = bp;
		spc->spc_swp = swp;
		spc->spc_kva = kva;
		/*
		 * Record the first page.  This allows swap_pager_clean
		 * to efficiently handle the common case of a single page.
		 * For clusters, it allows us to locate the object easily
		 * and we then reconstruct the rest of the mlist from spc_kva.
		 */
		spc->spc_m = m;
		spc->spc_npages = npages;
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		s = splbio();
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
		splx(s);
	}

	/*
	 * Finally, start the IO operation.
	 * If it is async we are all done, otherwise we must wait for
	 * completion and cleanup afterwards.
	 */
#ifdef DEBUG
	if (swpagerdebug & SDB_IO)
		printf("swpg_io: IO start: bp %x, db %x, va %x, pa %x\n",
		       bp, swb->swb_block+btodb(off), kva, VM_PAGE_TO_PHYS(m));
#endif
	VOP_STRATEGY(bp);
	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) {
#ifdef DEBUG
		if (swpagerdebug & SDB_IO)
			printf("swpg_io:  IO started: bp %x\n", bp);
#endif
		return(VM_PAGER_PEND);
	}
	/* synchronous case: sleep until the buffer completes */
	s = splbio();
#ifdef DEBUG
	if (flags & B_READ)
		swap_pager_piip++;
	else
		swap_pager_poip++;
#endif
	while ((bp->b_flags & B_DONE) == 0)
		(void) tsleep(bp, PVM, "swpgio", 0);
	if ((flags & B_READ) == 0)
		--swp->sw_poip;
#ifdef DEBUG
	if (flags & B_READ)
		--swap_pager_piip;
	else
		--swap_pager_poip;
#endif
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
	/* return the buf header to the swap buffer freelist */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	bp->b_actf = bswlist.b_actf;
	bswlist.b_actf = bp;
	if (bp->b_vp)
		brelvp(bp);
	if (bswlist.b_flags & B_WANTED) {
		bswlist.b_flags &= ~B_WANTED;
		wakeup(&bswlist);
	}
	/* successful sync write: page now matches swap, mark it clean */
	if ((flags & B_READ) == 0 && rv == VM_PAGER_OK) {
		m->flags |= PG_CLEAN;
		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
	}
	splx(s);
#ifdef DEBUG
	if (swpagerdebug & SDB_IO)
		printf("swpg_io:  IO done: bp %x, rv %d\n", bp, rv);
	if ((swpagerdebug & SDB_FAIL) && rv == VM_PAGER_ERROR)
		printf("swpg_io: IO error\n");
#endif
	vm_pager_unmap_pages(kva, npages);
	return(rv);
}
807 
/*
 * Reap completed asynchronous pageouts.  For each finished cleaning
 * entry: mark its pages clean (or dirty again on error), release the
 * object's paging reference, unmap the cluster KVA and return the
 * entry to the free list.  The rw argument is used only for debug
 * tracing here.
 */
static void
swap_pager_clean(rw)
	int rw;
{
	register swp_clean_t spc;
	register int s, i;
	vm_object_t object;
	vm_page_t m;

#ifdef DEBUG
	/* save panic time state */
	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
		return;
	if (swpagerdebug & SDB_FOLLOW)
		printf("swpg_clean(%x)\n", rw);
#endif

	for (;;) {
		/*
		 * Look up and removal from inuse list must be done
		 * at splbio() to avoid conflicts with swap_pager_iodone.
		 */
		s = splbio();
		for (spc = swap_pager_inuse.tqh_first;
		     spc != NULL;
		     spc = spc->spc_list.tqe_next) {
			/*
			 * If the operation is done, remove it from the
			 * list and process it.
			 *
			 * XXX if we can't get the object lock we also
			 * leave it on the list and try again later.
			 * Is there something better we could do?
			 */
			if ((spc->spc_flags & SPC_DONE) &&
			    vm_object_lock_try(spc->spc_m->object)) {
				TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
				break;
			}
		}
		splx(s);

		/*
		 * No operations done, that's all we can do for now.
		 */
		if (spc == NULL)
			break;

		/*
		 * Found a completed operation so finish it off.
		 * Note: no longer at splbio since entry is off the list.
		 */
		m = spc->spc_m;
		object = m->object;

		/*
		 * Process each page in the cluster.
		 * The first page is explicitly kept in the cleaning
		 * entry, others must be reconstructed from the KVA.
		 */
		for (i = 0; i < spc->spc_npages; i++) {
			if (i)
				m = vm_pager_atop(spc->spc_kva + ptoa(i));
			/*
			 * If no error mark as clean and inform the pmap
			 * system.  If there was an error, mark as dirty
			 * so we will try again.
			 *
			 * XXX could get stuck doing this, should give up
			 * after awhile.
			 */
			if (spc->spc_flags & SPC_ERROR) {
				printf("%s: clean of page %x failed\n",
				       "swap_pager_clean",
				       VM_PAGE_TO_PHYS(m));
				m->flags |= PG_LAUNDRY;
			} else {
				m->flags |= PG_CLEAN;
				pmap_clear_modify(VM_PAGE_TO_PHYS(m));
			}
			m->flags &= ~PG_BUSY;
			PAGE_WAKEUP(m);
		}

		/*
		 * Done with the object, decrement the paging count
		 * and unlock it.
		 */
		if (--object->paging_in_progress == 0)
			wakeup(object);
		vm_object_unlock(object);

		/*
		 * Free up KVM used and put the entry back on the list.
		 */
		vm_pager_unmap_pages(spc->spc_kva, spc->spc_npages);
		spc->spc_flags = SPC_FREE;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
#ifdef DEBUG
		if (swpagerdebug & SDB_WRITE)
			printf("swpg_clean: free spc %x\n", spc);
#endif
	}
}
912 
#ifdef DEBUG
/*
 * DEBUG-only consistency check: panic if any page in mlist is still
 * part of a cluster on the in-use cleaning list.  Invoked from
 * swap_pager_io when SDB_PARANOIA is set.
 */
static void
swap_pager_clean_check(mlist, npages, rw)
	vm_page_t *mlist;
	int npages;
	int rw;
{
	register swp_clean_t spc;
	boolean_t bad;
	int i, j, s;
	vm_page_t m;

	if (panicstr)
		return;

	bad = FALSE;
	s = splbio();
	for (spc = swap_pager_inuse.tqh_first;
	     spc != NULL;
	     spc = spc->spc_list.tqe_next) {
		/* reconstruct each page of the cluster from its KVA */
		for (j = 0; j < spc->spc_npages; j++) {
			m = vm_pager_atop(spc->spc_kva + ptoa(j));
			for (i = 0; i < npages; i++)
				if (m == mlist[i]) {
					if (swpagerdebug & SDB_ANOM)
						printf(
		"swpg_clean_check: %s: page %x on list, flags %x\n",
		rw == B_WRITE ? "write" : "read", mlist[i], spc->spc_flags);
					bad = TRUE;
				}
		}
	}
	splx(s);
	if (bad)
		panic("swpg_clean_check");
}
#endif
950 
/*
 * Completion callback for asynchronous swap writes (B_CALL buffers).
 * Marks the matching cleaning entry DONE (swap_pager_clean finishes
 * the job later), returns the buf header to the swap buffer freelist,
 * and wakes anyone waiting on the pager, the freelist, or page
 * resources.  All work is done at splbio.
 */
static void
swap_pager_iodone(bp)
	register struct buf *bp;
{
	register swp_clean_t spc;
	daddr_t blk;
	int s;

#ifdef DEBUG
	/* save panic time state */
	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
		return;
	if (swpagerdebug & SDB_FOLLOW)
		printf("swpg_iodone(%x)\n", bp);
#endif
	s = splbio();
	/* find the cleaning entry that owns this buffer */
	for (spc = swap_pager_inuse.tqh_first;
	     spc != NULL;
	     spc = spc->spc_list.tqe_next)
		if (spc->spc_bp == bp)
			break;
#ifdef DEBUG
	if (spc == NULL)
		panic("swap_pager_iodone: bp not found");
#endif
	/*
	 * NOTE(review): in a non-DEBUG kernel a missing entry would
	 * dereference a NULL spc below; presumed impossible since every
	 * async swap write queues an entry in swap_pager_io first.
	 */

	spc->spc_flags &= ~SPC_BUSY;
	spc->spc_flags |= SPC_DONE;
	if (bp->b_flags & B_ERROR)
		spc->spc_flags |= SPC_ERROR;
	spc->spc_bp = NULL;
	blk = bp->b_blkno;	/* XXX set but otherwise unused */

#ifdef DEBUG
	--swap_pager_poip;
	if (swpagerdebug & SDB_WRITE)
		printf("swpg_iodone: bp=%x swp=%x flags=%x spc=%x poip=%x\n",
		       bp, spc->spc_swp, spc->spc_swp->sw_flags,
		       spc, spc->spc_swp->sw_poip);
#endif

	/* one less pageout in progress; wake swap_pager_dealloc waiters */
	spc->spc_swp->sw_poip--;
	if (spc->spc_swp->sw_flags & SW_WANTED) {
		spc->spc_swp->sw_flags &= ~SW_WANTED;
		wakeup(spc->spc_swp);
	}

	/* return the buf header to the swap buffer freelist */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	bp->b_actf = bswlist.b_actf;
	bswlist.b_actf = bp;
	if (bp->b_vp)
		brelvp(bp);
	if (bswlist.b_flags & B_WANTED) {
		bswlist.b_flags &= ~B_WANTED;
		wakeup(&bswlist);
	}
	/* let the pageout daemon know resources may now be available */
	wakeup(&vm_pages_needed);
	splx(s);
}
1010