xref: /freebsd/sys/kern/vfs_aio.c (revision 17d6c636720d00f77e5d098daf4c278f89d84f7b)
1 /*
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  *
16  * $FreeBSD$
17  */
18 
19 /*
20  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21  */
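
/*
 * A minimal userland sketch of the facility implemented here (error
 * handling omitted; "fd" and "buf" are illustrative only):
 *
 *	struct aiocb cb;
 *
 *	bzero(&cb, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	aio_read(&cb);				(queued via aio_aqueue() below)
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;				(or use aio_suspend())
 *	(void)aio_return(&cb);
 */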
22 
23 #include <sys/param.h>
24 #include <sys/systm.h>
25 #include <sys/bio.h>
26 #include <sys/buf.h>
27 #include <sys/sysproto.h>
28 #include <sys/filedesc.h>
29 #include <sys/kernel.h>
30 #include <sys/kthread.h>
31 #include <sys/fcntl.h>
32 #include <sys/file.h>
33 #include <sys/lock.h>
34 #include <sys/mutex.h>
35 #include <sys/unistd.h>
36 #include <sys/proc.h>
37 #include <sys/resourcevar.h>
38 #include <sys/signalvar.h>
39 #include <sys/protosw.h>
40 #include <sys/socketvar.h>
41 #include <sys/syscall.h>
42 #include <sys/sysent.h>
43 #include <sys/sysctl.h>
44 #include <sys/vnode.h>
45 #include <sys/conf.h>
46 #include <sys/event.h>
47 
48 #include <vm/vm.h>
49 #include <vm/vm_extern.h>
50 #include <vm/pmap.h>
51 #include <vm/vm_map.h>
52 #include <vm/vm_zone.h>
53 #include <sys/aio.h>
54 
55 #include <machine/limits.h>
56 
57 #include "opt_vfs_aio.h"
58 
59 static	long jobrefid;
60 
61 #define JOBST_NULL		0x0
62 #define	JOBST_JOBQPROC		0x1
63 #define JOBST_JOBQGLOBAL	0x2
64 #define JOBST_JOBRUNNING	0x3
65 #define JOBST_JOBFINISHED	0x4
66 #define	JOBST_JOBQBUF		0x5
67 #define	JOBST_JOBBFINISHED	0x6
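
/*
 * Rough job state flow.  Daemon-serviced jobs:
 *	JOBST_NULL -> JOBST_JOBQGLOBAL (or JOBST_JOBQPROC) ->
 *	JOBST_JOBRUNNING -> JOBST_JOBFINISHED
 * Physio (direct buffer) jobs:
 *	JOBST_NULL -> JOBST_JOBQBUF -> JOBST_JOBBFINISHED
 * This is a summary of the transitions made below, not an exhaustive map.
 */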
68 
69 #ifndef MAX_AIO_PER_PROC
70 #define MAX_AIO_PER_PROC	32
71 #endif
72 
73 #ifndef MAX_AIO_QUEUE_PER_PROC
74 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
75 #endif
76 
77 #ifndef MAX_AIO_PROCS
78 #define MAX_AIO_PROCS		32
79 #endif
80 
81 #ifndef MAX_AIO_QUEUE
82 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
83 #endif
84 
85 #ifndef TARGET_AIO_PROCS
86 #define TARGET_AIO_PROCS	4
87 #endif
88 
89 #ifndef MAX_BUF_AIO
90 #define MAX_BUF_AIO		16
91 #endif
92 
93 #ifndef AIOD_TIMEOUT_DEFAULT
94 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
95 #endif
96 
97 #ifndef AIOD_LIFETIME_DEFAULT
98 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
99 #endif
100 
101 static int max_aio_procs = MAX_AIO_PROCS;
102 static int num_aio_procs = 0;
103 static int target_aio_procs = TARGET_AIO_PROCS;
104 static int max_queue_count = MAX_AIO_QUEUE;
105 static int num_queue_count = 0;
106 static int num_buf_aio = 0;
107 static int num_aio_resv_start = 0;
108 static int aiod_timeout;
109 static int aiod_lifetime;
110 static int unloadable = 0;
111 
112 static int max_aio_per_proc = MAX_AIO_PER_PROC;
113 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
114 static int max_buf_aio = MAX_BUF_AIO;
115 
116 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
117 
118 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
119 	CTLFLAG_RW, &max_aio_per_proc, 0, "");
120 
121 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
122 	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
123 
124 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
125 	CTLFLAG_RW, &max_aio_procs, 0, "");
126 
127 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
128 	CTLFLAG_RD, &num_aio_procs, 0, "");
129 
130 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
131 	CTLFLAG_RD, &num_queue_count, 0, "");
132 
133 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
134 	CTLFLAG_RW, &max_queue_count, 0, "");
135 
136 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
137 	CTLFLAG_RW, &target_aio_procs, 0, "");
138 
139 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
140 	CTLFLAG_RW, &max_buf_aio, 0, "");
141 
142 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
143 	CTLFLAG_RD, &num_buf_aio, 0, "");
144 
145 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
146 	CTLFLAG_RW, &aiod_lifetime, 0, "");
147 
148 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
149 	CTLFLAG_RW, &aiod_timeout, 0, "");
150 
151 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
152     "Allow unload of aio (not recommended)");
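
/*
 * These knobs live under the vfs.aio sysctl tree; for example (values are
 * illustrative only):
 *
 *	sysctl vfs.aio.max_aio_per_proc=64
 *	sysctl vfs.aio.num_queue_count		(read-only counter)
 */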
153 
154 struct aiocblist {
155         TAILQ_ENTRY(aiocblist) list;	/* List of jobs */
156         TAILQ_ENTRY(aiocblist) plist;	/* List of jobs for proc */
157         int	jobflags;
158         int	jobstate;
159 	int	inputcharge;
160 	int	outputcharge;
161 	struct	callout_handle timeouthandle;
162         struct	buf *bp;		/* Buffer pointer */
163         struct	proc *userproc;		/* User process */ /* Not td! */
164         struct	file *fd_file;		/* Pointer to file structure */
165 	struct	aiothreadlist *jobaiothread;  /* AIO process descriptor */
166         struct	aio_liojob *lio;	/* Optional lio job */
167         struct	aiocb *uuaiocb;		/* Pointer in userspace of aiocb */
168 	struct	klist klist;		/* list of knotes */
169         struct	aiocb uaiocb;		/* Kernel I/O control block */
170 };
171 
172 /* jobflags */
173 #define AIOCBLIST_RUNDOWN       0x4
174 #define AIOCBLIST_ASYNCFREE     0x8
175 #define AIOCBLIST_DONE          0x10
176 
177 /*
178  * AIO process info
179  */
180 #define AIOP_FREE	0x1			/* proc on free queue */
181 #define AIOP_SCHED	0x2			/* proc explicitly scheduled */
182 
183 struct aiothreadlist {
184 	int aiothreadflags;			/* AIO proc flags */
185 	TAILQ_ENTRY(aiothreadlist) list;	/* List of processes */
186 	struct thread *aiothread;		/* The AIO thread */
187 	TAILQ_HEAD(,aiocblist) jobtorun;	/* suggested job to run */
188 };
189 
190 /*
191  * data structure for lio signal management
192  */
193 struct aio_liojob {
194 	int	lioj_flags;
195 	int	lioj_buffer_count;
196 	int	lioj_buffer_finished_count;
197 	int	lioj_queue_count;
198 	int	lioj_queue_finished_count;
199 	struct	sigevent lioj_signal;	/* signal on all I/O done */
200 	TAILQ_ENTRY(aio_liojob) lioj_list;
201 	struct	kaioinfo *lioj_ki;
202 };
203 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
204 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
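
/*
 * Completion sketch: once every queued and buffer job belonging to an lio
 * has finished, and LIOJ_SIGNAL is set but LIOJ_SIGNAL_POSTED is not, the
 * requested signal is delivered once and LIOJ_SIGNAL_POSTED is set.
 */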
205 
206 /*
207  * per process aio data structure
208  */
209 struct kaioinfo {
210 	int	kaio_flags;		/* per process kaio flags */
211 	int	kaio_maxactive_count;	/* maximum number of AIOs */
212 	int	kaio_active_count;	/* number of currently used AIOs */
213 	int	kaio_qallowed_count;	/* maximum size of AIO queue */
214 	int	kaio_queue_count;	/* size of AIO queue */
215 	int	kaio_ballowed_count;	/* maximum number of buffers */
216 	int	kaio_queue_finished_count; /* number of daemon jobs finished */
217 	int	kaio_buffer_count;	/* number of physio buffers */
218 	int	kaio_buffer_finished_count; /* count of I/O done */
219 	struct 	proc *kaio_p;		/* process that uses this kaio block */
220 	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
221 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
222 	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
223 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
224 	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
225 	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
226 };
227 
228 #define KAIO_RUNDOWN	0x1	/* process is being run down */
229 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
230 
231 static TAILQ_HEAD(,aiothreadlist) aio_freeproc, aio_activeproc;
232 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
233 static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
234 
235 static void	aio_init_aioinfo(struct proc *p);
236 static void	aio_onceonly(void);
237 static int	aio_free_entry(struct aiocblist *aiocbe);
238 static void	aio_process(struct aiocblist *aiocbe);
239 static int	aio_newproc(void);
240 static int	aio_aqueue(struct thread *td, struct aiocb *job, int type);
241 static void	aio_physwakeup(struct buf *bp);
242 static void	aio_proc_rundown(struct proc *p);
243 static int	aio_fphysio(struct proc *p, struct aiocblist *aiocbe);
244 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
245 static void	aio_daemon(void *uproc);
246 static void	aio_swake_cb(struct socket *, struct sockbuf *);
247 static int	aio_unload(void);
248 static void	process_signal(void *aioj);
249 static int	filt_aioattach(struct knote *kn);
250 static void	filt_aiodetach(struct knote *kn);
251 static int	filt_aio(struct knote *kn, long hint);
252 
253 static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone;
254 static vm_zone_t aiolio_zone;
255 
256 static struct filterops aio_filtops =
257 	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
258 
259 static int
260 aio_modload(struct module *module, int cmd, void *arg)
261 {
262 	int error = 0;
263 
264 	switch (cmd) {
265 	case MOD_LOAD:
266 		aio_onceonly();
267 		break;
268 	case MOD_UNLOAD:
269 		error = aio_unload();
270 		break;
271 	case MOD_SHUTDOWN:
272 		break;
273 	default:
274 		error = EINVAL;
275 		break;
276 	}
277 	return (error);
278 }
279 
280 static moduledata_t aio_mod = {
281 	"aio",
282 	&aio_modload,
283 	NULL
284 };
285 
286 SYSCALL_MODULE_HELPER(aio_return);
287 SYSCALL_MODULE_HELPER(aio_suspend);
288 SYSCALL_MODULE_HELPER(aio_cancel);
289 SYSCALL_MODULE_HELPER(aio_error);
290 SYSCALL_MODULE_HELPER(aio_read);
291 SYSCALL_MODULE_HELPER(aio_write);
292 SYSCALL_MODULE_HELPER(aio_waitcomplete);
293 SYSCALL_MODULE_HELPER(lio_listio);
294 
295 DECLARE_MODULE(aio, aio_mod,
296 	SI_SUB_VFS, SI_ORDER_ANY);
297 MODULE_VERSION(aio, 1);
298 
299 /*
300  * Startup initialization
301  */
302 static void
303 aio_onceonly(void)
304 {
305 
306 	/* XXX: should probably just use so->callback */
307 	aio_swake = &aio_swake_cb;
308 	at_exit(aio_proc_rundown);
309 	at_exec(aio_proc_rundown);
310 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
311 	TAILQ_INIT(&aio_freeproc);
312 	TAILQ_INIT(&aio_activeproc);
313 	TAILQ_INIT(&aio_jobs);
314 	TAILQ_INIT(&aio_bufjobs);
315 	kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
316 	aiop_zone = zinit("AIOP", sizeof(struct aiothreadlist), 0, 0, 1);
317 	aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
318 	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1);
319 	aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof(struct
320 	    aio_liojob), 0, 0, 1);
321 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
322 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
323 	jobrefid = 1;
324 }
325 
326 static int
327 aio_unload(void)
328 {
329 
330 	/*
331 	 * XXX: no unloads by default, it's too dangerous.
332 	 * perhaps we could do it if we locked out callers and then
333 	 * did an aio_proc_rundown() on each process.
334 	 */
335 	if (!unloadable)
336 		return (EOPNOTSUPP);
337 
338 	aio_swake = NULL;
339 	rm_at_exit(aio_proc_rundown);
340 	rm_at_exec(aio_proc_rundown);
341 	kqueue_del_filteropts(EVFILT_AIO);
342 	return (0);
343 }
344 
345 /*
346  * Init the per-process aioinfo structure.  The aioinfo limits are set
347  * per-process for user limit (resource) management.
348  */
349 static void
350 aio_init_aioinfo(struct proc *p)
351 {
352 	struct kaioinfo *ki;
353 	if (p->p_aioinfo == NULL) {
354 		ki = zalloc(kaio_zone);
355 		p->p_aioinfo = ki;
356 		ki->kaio_flags = 0;
357 		ki->kaio_maxactive_count = max_aio_per_proc;
358 		ki->kaio_active_count = 0;
359 		ki->kaio_qallowed_count = max_aio_queue_per_proc;
360 		ki->kaio_queue_count = 0;
361 		ki->kaio_ballowed_count = max_buf_aio;
362 		ki->kaio_buffer_count = 0;
363 		ki->kaio_buffer_finished_count = 0;
364 		ki->kaio_p = p;
365 		TAILQ_INIT(&ki->kaio_jobdone);
366 		TAILQ_INIT(&ki->kaio_jobqueue);
367 		TAILQ_INIT(&ki->kaio_bufdone);
368 		TAILQ_INIT(&ki->kaio_bufqueue);
369 		TAILQ_INIT(&ki->kaio_liojoblist);
370 		TAILQ_INIT(&ki->kaio_sockqueue);
371 	}
372 
373 	while (num_aio_procs < target_aio_procs)
374 		aio_newproc();
375 }
376 
377 /*
378  * Free a job entry.  Wait for completion if it is currently active, but don't
379  * delay forever.  If we delay, we return a flag that says that we have to
380  * restart the queue scan.
381  */
382 static int
383 aio_free_entry(struct aiocblist *aiocbe)
384 {
385 	struct kaioinfo *ki;
386 	struct aiothreadlist *aiop;
387 	struct aio_liojob *lj;
388 	struct proc *p;
389 	int error;
390 	int s;
391 
392 	if (aiocbe->jobstate == JOBST_NULL)
393 		panic("aio_free_entry: freeing already free job");
394 
395 	p = aiocbe->userproc;
396 	ki = p->p_aioinfo;
397 	lj = aiocbe->lio;
398 	if (ki == NULL)
399 		panic("aio_free_entry: missing p->p_aioinfo");
400 
401 	while (aiocbe->jobstate == JOBST_JOBRUNNING) {
402 		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
403 			return 0;
404 		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
405 		tsleep(aiocbe, PRIBIO, "jobwai", 0);
406 	}
407 	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
408 
409 	if (aiocbe->bp == NULL) {
410 		if (ki->kaio_queue_count <= 0)
411 			panic("aio_free_entry: process queue size <= 0");
412 		if (num_queue_count <= 0)
413 			panic("aio_free_entry: system wide queue size <= 0");
414 
415 		if (lj) {
416 			lj->lioj_queue_count--;
417 			if (aiocbe->jobflags & AIOCBLIST_DONE)
418 				lj->lioj_queue_finished_count--;
419 		}
420 		ki->kaio_queue_count--;
421 		if (aiocbe->jobflags & AIOCBLIST_DONE)
422 			ki->kaio_queue_finished_count--;
423 		num_queue_count--;
424 	} else {
425 		if (lj) {
426 			lj->lioj_buffer_count--;
427 			if (aiocbe->jobflags & AIOCBLIST_DONE)
428 				lj->lioj_buffer_finished_count--;
429 		}
430 		if (aiocbe->jobflags & AIOCBLIST_DONE)
431 			ki->kaio_buffer_finished_count--;
432 		ki->kaio_buffer_count--;
433 		num_buf_aio--;
434 	}
435 
436 	/* aiocbe is going away; we need to destroy any knotes */
437 	knote_remove(&p->p_thread, &aiocbe->klist); /* XXXKSE */
438 	/* XXXKSE Note the thread here is used to eventually find the
439 	 * owning process again, but it is also used to do a fo_close
440 	 * and that requires the thread.  (But does it require the
441 	 * OWNING thread, or maybe the running thread?)
442 	 * There is a semantic problem here...
443 	 */
444 
445 	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
446 	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
447 		ki->kaio_flags &= ~KAIO_WAKEUP;
448 		wakeup(p);
449 	}
450 
451 	if (aiocbe->jobstate == JOBST_JOBQBUF) {
452 		if ((error = aio_fphysio(p, aiocbe)) != 0)
453 			return error;
454 		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
455 			panic("aio_free_entry: invalid physio finish-up state");
456 		s = splbio();
457 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
458 		splx(s);
459 	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
460 		aiop = aiocbe->jobaiothread;
461 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
462 	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
463 		s = splnet();
464 		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
465 		TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
466 		splx(s);
467 	} else if (aiocbe->jobstate == JOBST_JOBFINISHED)
468 		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
469 	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
470 		s = splbio();
471 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
472 		splx(s);
473 		if (aiocbe->bp) {
474 			vunmapbuf(aiocbe->bp);
475 			relpbuf(aiocbe->bp, NULL);
476 			aiocbe->bp = NULL;
477 		}
478 	}
479 	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
480 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
481 		zfree(aiolio_zone, lj);
482 	}
483 	aiocbe->jobstate = JOBST_NULL;
484 	untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
485 	zfree(aiocb_zone, aiocbe);
486 	return 0;
487 }
488 
489 /*
490  * Rundown the jobs for a given process.
491  */
492 static void
493 aio_proc_rundown(struct proc *p)
494 {
495 	int s;
496 	struct kaioinfo *ki;
497 	struct aio_liojob *lj, *ljn;
498 	struct aiocblist *aiocbe, *aiocbn;
499 	struct file *fp;
500 	struct filedesc *fdp;
501 	struct socket *so;
502 
503 	ki = p->p_aioinfo;
504 	if (ki == NULL)
505 		return;
506 
507 	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
508 	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
509 	    ki->kaio_buffer_finished_count)) {
510 		ki->kaio_flags |= KAIO_RUNDOWN;
511 		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
512 			break;
513 	}
514 
515 	/*
516 	 * Move any aio ops that are waiting on socket I/O to the normal job
517 	 * queues so they are cleaned up with any others.
518 	 */
519 	fdp = p->p_fd;
520 
521 	s = splnet();
522 	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
523 	    aiocbn) {
524 		aiocbn = TAILQ_NEXT(aiocbe, plist);
525 		fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];
526 
527 		/*
528 		 * Under some circumstances, the aio_fildes and the file
529 		 * structure don't match.  This would leave aiocbe's in the
530 		 * TAILQ associated with the socket and cause a panic later.
531 		 *
532 		 * Detect and fix.
533 		 */
534 		if ((fp == NULL) || (fp != aiocbe->fd_file))
535 			fp = aiocbe->fd_file;
536 		if (fp) {
537 			so = (struct socket *)fp->f_data;
538 			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
539 			if (TAILQ_EMPTY(&so->so_aiojobq)) {
540 				so->so_snd.sb_flags &= ~SB_AIO;
541 				so->so_rcv.sb_flags &= ~SB_AIO;
542 			}
543 		}
544 		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
545 		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
546 		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
547 	}
548 	splx(s);
549 
550 restart1:
551 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
552 		aiocbn = TAILQ_NEXT(aiocbe, plist);
553 		if (aio_free_entry(aiocbe))
554 			goto restart1;
555 	}
556 
557 restart2:
558 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
559 	    aiocbn) {
560 		aiocbn = TAILQ_NEXT(aiocbe, plist);
561 		if (aio_free_entry(aiocbe))
562 			goto restart2;
563 	}
564 
565 /*
566  * Note the use of lots of splbio here, trying to avoid holding splbio across
567  * long chains of I/O.  Probably unnecessary.
568  */
569 restart3:
570 	s = splbio();
571 	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
572 		ki->kaio_flags |= KAIO_WAKEUP;
573 		tsleep(p, PRIBIO, "aioprn", 0);
574 		splx(s);
575 		goto restart3;
576 	}
577 	splx(s);
578 
579 restart4:
580 	s = splbio();
581 	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
582 		aiocbn = TAILQ_NEXT(aiocbe, plist);
583 		if (aio_free_entry(aiocbe)) {
584 			splx(s);
585 			goto restart4;
586 		}
587 	}
588 	splx(s);
589 
590         /*
591          * If we've slept, jobs might have moved from one queue to another.
592          * Retry rundown if we didn't manage to empty the queues.
593          */
594         if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
595 	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
596 	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
597 	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
598 		goto restart1;
599 
600 	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
601 		ljn = TAILQ_NEXT(lj, lioj_list);
602 		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
603 		    0)) {
604 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
605 			zfree(aiolio_zone, lj);
606 		} else {
607 #ifdef DIAGNOSTIC
608 			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
609 			    "QF:%d\n", lj->lioj_buffer_count,
610 			    lj->lioj_buffer_finished_count,
611 			    lj->lioj_queue_count,
612 			    lj->lioj_queue_finished_count);
613 #endif
614 		}
615 	}
616 
617 	zfree(kaio_zone, ki);
618 	p->p_aioinfo = NULL;
619 }
620 
621 /*
622  * Select a job to run (called by an AIO daemon).
623  */
624 static struct aiocblist *
625 aio_selectjob(struct aiothreadlist *aiop)
626 {
627 	int s;
628 	struct aiocblist *aiocbe;
629 	struct kaioinfo *ki;
630 	struct proc *userp;
631 
632 	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
633 	if (aiocbe) {
634 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
635 		return aiocbe;
636 	}
637 
638 	s = splnet();
639 	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
640 	    TAILQ_NEXT(aiocbe, list)) {
641 		userp = aiocbe->userproc;
642 		ki = userp->p_aioinfo;
643 
644 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
645 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
646 			splx(s);
647 			return aiocbe;
648 		}
649 	}
650 	splx(s);
651 
652 	return NULL;
653 }
654 
655 /*
656  * The AIO processing activity.  This is the code that does the I/O request for
657  * the non-physio version of the operations.  The normal vn operations are used,
658  * and this code should work in all instances for every type of file, including
659  * pipes, sockets, fifos, and regular files.
660  */
661 static void
662 aio_process(struct aiocblist *aiocbe)
663 {
664 	struct filedesc *fdp;
665 	struct thread *td;
666 	struct proc *userp;
667 	struct proc *mycp;
668 	struct aiocb *cb;
669 	struct file *fp;
670 	struct uio auio;
671 	struct iovec aiov;
672 	unsigned int fd;
673 	int cnt;
674 	int error;
675 	off_t offset;
676 	int oublock_st, oublock_end;
677 	int inblock_st, inblock_end;
678 
679 	userp = aiocbe->userproc;
680 	td = curthread;
681 	mycp = td->td_proc;
682 	cb = &aiocbe->uaiocb;
683 
684 	fdp = mycp->p_fd;
685 	fd = cb->aio_fildes;
686 	fp = fdp->fd_ofiles[fd];
687 
688 	if ((fp == NULL) || (fp != aiocbe->fd_file)) {
689 		cb->_aiocb_private.error = EBADF;
690 		cb->_aiocb_private.status = -1;
691 		return;
692 	}
693 
694 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
695 	aiov.iov_len = cb->aio_nbytes;
696 
697 	auio.uio_iov = &aiov;
698 	auio.uio_iovcnt = 1;
699 	auio.uio_offset = offset = cb->aio_offset;
700 	auio.uio_resid = cb->aio_nbytes;
701 	cnt = cb->aio_nbytes;
702 	auio.uio_segflg = UIO_USERSPACE;
703 	auio.uio_td = td;
704 
705 	inblock_st = mycp->p_stats->p_ru.ru_inblock;
706 	oublock_st = mycp->p_stats->p_ru.ru_oublock;
707 	/*
708 	 * Temporarily bump the ref count while reading to avoid the
709 	 * descriptor being ripped out from under us.
710 	 */
711 	fhold(fp);
712 	if (cb->aio_lio_opcode == LIO_READ) {
713 		auio.uio_rw = UIO_READ;
714 		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
715 	} else {
716 		auio.uio_rw = UIO_WRITE;
717 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
718 	}
719 	fdrop(fp, td);
720 	inblock_end = mycp->p_stats->p_ru.ru_inblock;
721 	oublock_end = mycp->p_stats->p_ru.ru_oublock;
722 
723 	aiocbe->inputcharge = inblock_end - inblock_st;
724 	aiocbe->outputcharge = oublock_end - oublock_st;
725 
726 	if ((error) && (auio.uio_resid != cnt)) {
727 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
728 			error = 0;
729 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
730 			PROC_LOCK(userp);
731 			psignal(userp, SIGPIPE);
732 			PROC_UNLOCK(userp);
733 		}
734 	}
735 
736 	cnt -= auio.uio_resid;
737 	cb->_aiocb_private.error = error;
738 	cb->_aiocb_private.status = cnt;
739 }
740 
741 /*
742  * The AIO daemon: most of the actual work is done in aio_process(), but the
743  * setup (and address space management) is done in this routine.
744  */
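/*
 * Loop sketch: grab a job with aio_selectjob(), borrow the owning process's
 * vmspace and file descriptors, run aio_process(), post the completion
 * (done queue, knote, optional signal), then go back onto the free list and
 * possibly exit after aiod_lifetime of idleness.
 */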
745 static void
746 aio_daemon(void *uproc)
747 {
748 	int s;
749 	struct aio_liojob *lj;
750 	struct aiocb *cb;
751 	struct aiocblist *aiocbe;
752 	struct aiothreadlist *aiop;
753 	struct kaioinfo *ki;
754 	struct proc *curcp, *mycp, *userp;
755 	struct vmspace *myvm, *tmpvm;
756 	struct thread *td = curthread;
757 
758 	mtx_lock(&Giant);
759 	/*
760 	 * Local copies of curproc (mycp) and vmspace (myvm)
761 	 */
762 	mycp = td->td_proc;
763 	myvm = mycp->p_vmspace;
764 
765 	if (mycp->p_textvp) {
766 		vrele(mycp->p_textvp);
767 		mycp->p_textvp = NULL;
768 	}
769 
770 	/*
771 	 * Allocate and ready the aio control info.  There is one aiop structure
772 	 * per daemon.
773 	 */
774 	aiop = zalloc(aiop_zone);
775 	aiop->aiothread = td;
776 	aiop->aiothreadflags |= AIOP_FREE;
777 	TAILQ_INIT(&aiop->jobtorun);
778 
779 	s = splnet();
780 
781 	/*
782 	 * Place thread (lightweight process) onto the AIO free thread list.
783 	 */
784 	if (TAILQ_EMPTY(&aio_freeproc))
785 		wakeup(&aio_freeproc);
786 	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
787 
788 	splx(s);
789 
790 	/*
791 	 * Get rid of our current file descriptors.  AIO daemons don't need any
792 	 * file descriptors, except as temporarily inherited from the client.
793 	 */
794 	fdfree(td);
795 	mycp->p_fd = NULL;
796 
797 	/* The daemon resides in its own pgrp. */
798 	enterpgrp(mycp, mycp->p_pid, 1);
799 
800 	/* Mark special process type. */
801 	mycp->p_flag |= P_SYSTEM;
802 
803 	/*
804 	 * Wake up the parent process.  (The parent sleeps to keep from blasting
805 	 * away and creating too many daemons.)
806 	 */
807 	wakeup(mycp);
808 
809 	for (;;) {
810 		/*
811 		 * curcp is the current daemon process context.
812 		 * userp is the current user process context.
813 		 */
814 		curcp = mycp;
815 
816 		/*
817 		 * Take the daemon off the free queue.
818 		 */
819 		if (aiop->aiothreadflags & AIOP_FREE) {
820 			s = splnet();
821 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
822 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
823 			aiop->aiothreadflags &= ~AIOP_FREE;
824 			splx(s);
825 		}
826 		aiop->aiothreadflags &= ~AIOP_SCHED;
827 
828 		/*
829 		 * Check for jobs.
830 		 */
831 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
832 			cb = &aiocbe->uaiocb;
833 			userp = aiocbe->userproc;
834 
835 			aiocbe->jobstate = JOBST_JOBRUNNING;
836 
837 			/*
838 			 * Connect to process address space for user program.
839 			 */
840 			if (userp != curcp) {
841 				/*
842 				 * Save the current address space that we are
843 				 * connected to.
844 				 */
845 				tmpvm = mycp->p_vmspace;
846 
847 				/*
848 				 * Point to the new user address space and take
849 				 * a reference on it.
850 				 */
851 				mycp->p_vmspace = userp->p_vmspace;
852 				mycp->p_vmspace->vm_refcnt++;
853 
854 				/* Activate the new mapping. */
855 				pmap_activate(&mycp->p_thread);
856 
857 				/*
858 				 * If the old address space wasn't the daemon's
859 				 * own address space, then we need to remove the
860 				 * daemon's reference from the other process
861 				 * that it was acting on behalf of.
862 				 */
863 				if (tmpvm != myvm) {
864 					vmspace_free(tmpvm);
865 				}
866 
867 				/*
868 				 * Disassociate from the previous client's file
869 				 * descriptors, and associate with the new client's
870 				 * descriptors.  Note that the daemon doesn't
871 				 * need to worry about its original descriptors,
872 				 * because they were originally freed.
873 				 */
874 				if (mycp->p_fd)
875 					fdfree(td);
876 				mycp->p_fd = fdshare(userp);
877 				curcp = userp;
878 			}
879 
880 			ki = userp->p_aioinfo;
881 			lj = aiocbe->lio;
882 
883 			/* Account for currently active jobs. */
884 			ki->kaio_active_count++;
885 
886 			/* Do the I/O function. */
887 			aiocbe->jobaiothread = aiop;
888 			aio_process(aiocbe);
889 
890 			/* Decrement the active job count. */
891 			ki->kaio_active_count--;
892 
893 			/*
894 			 * Increment the completion count for wakeup/signal
895 			 * comparisons.
896 			 */
897 			aiocbe->jobflags |= AIOCBLIST_DONE;
898 			ki->kaio_queue_finished_count++;
899 			if (lj)
900 				lj->lioj_queue_finished_count++;
901 			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
902 			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
903 				ki->kaio_flags &= ~KAIO_WAKEUP;
904 				wakeup(userp);
905 			}
906 
907 			s = splbio();
908 			if (lj && (lj->lioj_flags &
909 			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
910 				if ((lj->lioj_queue_finished_count ==
911 				    lj->lioj_queue_count) &&
912 				    (lj->lioj_buffer_finished_count ==
913 				    lj->lioj_buffer_count)) {
914 					PROC_LOCK(userp);
915 					psignal(userp,
916 					    lj->lioj_signal.sigev_signo);
917 					PROC_UNLOCK(userp);
918 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
919 				}
920 			}
921 			splx(s);
922 
923 			aiocbe->jobstate = JOBST_JOBFINISHED;
924 
925 			/*
926 			 * If the I/O request should be automatically run down,
927 			 * do the needed cleanup.  Otherwise, place the queue
928 			 * entry for the just-finished I/O request into the done
929 			 * queue for the associated client.
930 			 */
931 			s = splnet();
932 			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
933 				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
934 				zfree(aiocb_zone, aiocbe);
935 			} else {
936 				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
937 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
938 				    plist);
939 			}
940 			splx(s);
941 			KNOTE(&aiocbe->klist, 0);
942 
943 			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
944 				wakeup(aiocbe);
945 				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
946 			}
947 
948 			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
949 				PROC_LOCK(userp);
950 				psignal(userp, cb->aio_sigevent.sigev_signo);
951 				PROC_UNLOCK(userp);
952 			}
953 		}
954 
955 		/*
956 		 * Disconnect from user address space.
957 		 */
958 		if (curcp != mycp) {
959 			/* Get the user address space to disconnect from. */
960 			tmpvm = mycp->p_vmspace;
961 
962 			/* Get original address space for daemon. */
963 			mycp->p_vmspace = myvm;
964 
965 			/* Activate the daemon's address space. */
966 			pmap_activate(&mycp->p_thread);
967 #ifdef DIAGNOSTIC
968 			if (tmpvm == myvm) {
969 				printf("AIOD: vmspace problem -- %d\n",
970 				    mycp->p_pid);
971 			}
972 #endif
973 			/* Remove our vmspace reference. */
974 			vmspace_free(tmpvm);
975 
976 			/*
977 			 * Disassociate from the user process's file
978 			 * descriptors.
979 			 */
980 			if (mycp->p_fd)
981 				fdfree(td);
982 			mycp->p_fd = NULL;
983 			curcp = mycp;
984 		}
985 
986 		/*
987 		 * If we are the first to be put onto the free queue, wake up
988 		 * anyone waiting for a daemon.
989 		 */
990 		s = splnet();
991 		TAILQ_REMOVE(&aio_activeproc, aiop, list);
992 		if (TAILQ_EMPTY(&aio_freeproc))
993 			wakeup(&aio_freeproc);
994 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
995 		aiop->aiothreadflags |= AIOP_FREE;
996 		splx(s);
997 
998 		/*
999 		 * If the daemon is inactive for a long time, allow it to exit,
1000 		 * thereby freeing resources.
1001 		 */
1002 		if (((aiop->aiothreadflags & AIOP_SCHED) == 0) && tsleep(mycp,
1003 		    PRIBIO, "aiordy", aiod_lifetime)) {
1004 			s = splnet();
1005 			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
1006 			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
1007 				if ((aiop->aiothreadflags & AIOP_FREE) &&
1008 				    (num_aio_procs > target_aio_procs)) {
1009 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
1010 					splx(s);
1011 					zfree(aiop_zone, aiop);
1012 					num_aio_procs--;
1013 #ifdef DIAGNOSTIC
1014 					if (mycp->p_vmspace->vm_refcnt <= 1) {
1015 						printf("AIOD: bad vm refcnt for"
1016 						    " exiting daemon: %d\n",
1017 						    mycp->p_vmspace->vm_refcnt);
1018 					}
1019 #endif
1020 					kthread_exit(0);
1021 				}
1022 			}
1023 			splx(s);
1024 		}
1025 	}
1026 }
1027 
1028 /*
1029  * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
1030  * AIO daemon modifies its environment itself.
1031  */
1032 static int
1033 aio_newproc()
1034 {
1035 	int error;
1036 	struct proc *p;
1037 
1038 	error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d",
1039 			       num_aio_procs);
1040 	if (error)
1041 		return error;
1042 
1043 	/*
1044 	 * Wait until the daemon has started, but continue on anyway so we can
1045 	 * handle error conditions.
1046 	 */
1047 	error = tsleep(p, PZERO, "aiosta", aiod_timeout);
1048 
1049 	num_aio_procs++;
1050 
1051 	return error;
1052 }
1053 
1054 /*
1055  * Try the high-performance, low-overhead physio method for eligible
1056  * VCHR devices.  This method doesn't use an aio helper thread, and
1057  * thus has very low overhead.
1058  *
1059  * Assumes that the caller, _aio_aqueue(), has incremented the file
1060  * structure's reference count, preventing its deallocation for the
1061  * duration of this call.
1062  */
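/*
 * Sketch of the eligibility checks below: the descriptor must be a vnode
 * backed by a disk device, the transfer must be a multiple of the device's
 * physical block size and fit within MAXPHYS, and the process must be under
 * its kaio_ballowed_count buffer quota; otherwise -1 is returned so the
 * caller falls back to the daemon path.
 */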
1063 static int
1064 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1065 {
1066 	int error;
1067 	struct aiocb *cb;
1068 	struct file *fp;
1069 	struct buf *bp;
1070 	struct vnode *vp;
1071 	struct kaioinfo *ki;
1072 	struct filedesc *fdp;
1073 	struct aio_liojob *lj;
1074 	int fd;
1075 	int s;
1076 	int notify;
1077 
1078 	cb = &aiocbe->uaiocb;
1079 	fdp = p->p_fd;
1080 	fd = cb->aio_fildes;
1081 	fp = fdp->fd_ofiles[fd];
1082 
1083 	if (fp->f_type != DTYPE_VNODE)
1084 		return (-1);
1085 
1086 	vp = (struct vnode *)fp->f_data;
1087 
1088 	/*
1089 	 * If it's not a disk, we don't want to return a positive error.
1090 	 * That would keep the aio code from falling through to the threaded
1091 	 * path when the target is a regular file.
1092 	 */
1093 	if (!vn_isdisk(vp, &error)) {
1094 		if (error == ENOTBLK)
1095 			return (-1);
1096 		else
1097 			return (error);
1098 	}
1099 
1100  	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
1101 		return (-1);
1102 
1103 	if (cb->aio_nbytes >
1104 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1105 		return (-1);
1106 
1107 	ki = p->p_aioinfo;
1108 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1109 		return (-1);
1110 
1111 	ki->kaio_buffer_count++;
1112 
1113 	lj = aiocbe->lio;
1114 	if (lj)
1115 		lj->lioj_buffer_count++;
1116 
1117 	/* Create and build a buffer header for a transfer. */
1118 	bp = (struct buf *)getpbuf(NULL);
1119 	BUF_KERNPROC(bp);
1120 
1121 	/*
1122 	 * Get a copy of the kva from the physical buffer.
1123 	 */
1124 	bp->b_caller1 = p;
1125 	bp->b_dev = vp->v_rdev;
1126 	error = bp->b_error = 0;
1127 
1128 	bp->b_bcount = cb->aio_nbytes;
1129 	bp->b_bufsize = cb->aio_nbytes;
1130 	bp->b_flags = B_PHYS;
1131 	bp->b_iodone = aio_physwakeup;
1132 	bp->b_saveaddr = bp->b_data;
1133 	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1134 	bp->b_blkno = btodb(cb->aio_offset);
1135 
1136 	if (cb->aio_lio_opcode == LIO_WRITE) {
1137 		bp->b_iocmd = BIO_WRITE;
1138 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1139 			error = EFAULT;
1140 			goto doerror;
1141 		}
1142 	} else {
1143 		bp->b_iocmd = BIO_READ;
1144 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1145 			error = EFAULT;
1146 			goto doerror;
1147 		}
1148 	}
1149 
1150 	/* Bring buffer into kernel space. */
1151 	vmapbuf(bp);
1152 
1153 	s = splbio();
1154 	aiocbe->bp = bp;
1155 	bp->b_spc = (void *)aiocbe;
1156 	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1157 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1158 	aiocbe->jobstate = JOBST_JOBQBUF;
1159 	cb->_aiocb_private.status = cb->aio_nbytes;
1160 	num_buf_aio++;
1161 	bp->b_error = 0;
1162 
1163 	splx(s);
1164 
1165 	/* Perform transfer. */
1166 	DEV_STRATEGY(bp, 0);
1167 
1168 	notify = 0;
1169 	s = splbio();
1170 
1171 	/*
1172 	 * If we had an error invoking the request, or an error in processing
1173 	 * the request before we have returned, we process it as an error in
1174 	 * transfer.  Note that such an I/O error is not indicated immediately,
1175 	 * but is returned using the aio_error mechanism.  In this case,
1176 	 * aio_suspend will return immediately.
1177 	 */
1178 	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1179 		struct aiocb *job = aiocbe->uuaiocb;
1180 
1181 		aiocbe->uaiocb._aiocb_private.status = 0;
1182 		suword(&job->_aiocb_private.status, 0);
1183 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1184 		suword(&job->_aiocb_private.error, bp->b_error);
1185 
1186 		ki->kaio_buffer_finished_count++;
1187 
1188 		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1189 			aiocbe->jobstate = JOBST_JOBBFINISHED;
1190 			aiocbe->jobflags |= AIOCBLIST_DONE;
1191 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1192 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1193 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1194 			notify = 1;
1195 		}
1196 	}
1197 	splx(s);
1198 	if (notify)
1199 		KNOTE(&aiocbe->klist, 0);
1200 	return 0;
1201 
1202 doerror:
1203 	ki->kaio_buffer_count--;
1204 	if (lj)
1205 		lj->lioj_buffer_count--;
1206 	aiocbe->bp = NULL;
1207 	relpbuf(bp, NULL);
1208 	return error;
1209 }
1210 
1211 /*
1212  * This waits/tests physio completion.
1213  */
1214 static int
1215 aio_fphysio(struct proc *p, struct aiocblist *iocb)
1216 {
1217 	int s;
1218 	struct buf *bp;
1219 	int error;
1220 
1221 	bp = iocb->bp;
1222 
1223 	s = splbio();
1224 	while ((bp->b_flags & B_DONE) == 0) {
1225 		if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
1226 			if ((bp->b_flags & B_DONE) == 0) {
1227 				splx(s);
1228 				return EINPROGRESS;
1229 			} else
1230 				break;
1231 		}
1232 	}
1233 	splx(s);
1234 
1235 	/* Release mapping into kernel space. */
1236 	vunmapbuf(bp);
1237 	iocb->bp = 0;
1238 
1239 	error = 0;
1240 
1241 	/* Check for an error. */
1242 	if (bp->b_ioflags & BIO_ERROR)
1243 		error = bp->b_error;
1244 
1245 	relpbuf(bp, NULL);
1246 	return (error);
1247 }
1248 
1249 /*
1250  * Wake up aio requests that may be serviceable now.
1251  */
1252 static void
1253 aio_swake_cb(struct socket *so, struct sockbuf *sb)
1254 {
1255 	struct aiocblist *cb,*cbn;
1256 	struct proc *p;
1257 	struct kaioinfo *ki = NULL;
1258 	int opcode, wakecount = 0;
1259 	struct aiothreadlist *aiop;
1260 
1261 	if (sb == &so->so_snd) {
1262 		opcode = LIO_WRITE;
1263 		so->so_snd.sb_flags &= ~SB_AIO;
1264 	} else {
1265 		opcode = LIO_READ;
1266 		so->so_rcv.sb_flags &= ~SB_AIO;
1267 	}
1268 
1269 	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1270 		cbn = TAILQ_NEXT(cb, list);
1271 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1272 			p = cb->userproc;
1273 			ki = p->p_aioinfo;
1274 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1275 			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1276 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1277 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1278 			wakecount++;
1279 			if (cb->jobstate != JOBST_JOBQGLOBAL)
1280 				panic("invalid queue value");
1281 		}
1282 	}
1283 
1284 	while (wakecount--) {
1285 		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1286 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1287 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1288 			aiop->aiothreadflags &= ~AIOP_FREE;
1289 			wakeup(aiop->aiothread);
1290 		}
1291 	}
1292 }
1293 
1294 /*
1295  * Queue a new AIO request.  The choice between the threaded and the direct
1296  * physio (VCHR) technique is made in this code.
1297  */
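/*
 * Hedged userland example of requesting kevent-based completion notification
 * for a request queued through this path ("kq" being a kqueue() descriptor,
 * "udata" illustrative):
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sigval_ptr = udata;
 */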
1298 static int
1299 _aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
1300 {
1301 	struct proc *p = td->td_proc;
1302 	struct filedesc *fdp;
1303 	struct file *fp;
1304 	unsigned int fd;
1305 	struct socket *so;
1306 	int s;
1307 	int error;
1308 	int opcode;
1309 	struct aiocblist *aiocbe;
1310 	struct aiothreadlist *aiop;
1311 	struct kaioinfo *ki;
1312 	struct kevent kev;
1313 	struct kqueue *kq;
1314 	struct file *kq_fp;
1315 
1316 	aiocbe = zalloc(aiocb_zone);
1317 	aiocbe->inputcharge = 0;
1318 	aiocbe->outputcharge = 0;
1319 	callout_handle_init(&aiocbe->timeouthandle);
1320 	SLIST_INIT(&aiocbe->klist);
1321 
1322 	suword(&job->_aiocb_private.status, -1);
1323 	suword(&job->_aiocb_private.error, 0);
1324 	suword(&job->_aiocb_private.kernelinfo, -1);
1325 
1326 	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1327 	if (error) {
1328 		suword(&job->_aiocb_private.error, error);
1329 		zfree(aiocb_zone, aiocbe);
1330 		return error;
1331 	}
1332 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1333 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1334 		zfree(aiocb_zone, aiocbe);
1335 		return EINVAL;
1336 	}
1337 
1338 	/* Save userspace address of the job info. */
1339 	aiocbe->uuaiocb = job;
1340 
1341 	/* Get the opcode. */
1342 	if (type != LIO_NOP)
1343 		aiocbe->uaiocb.aio_lio_opcode = type;
1344 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1345 
1346 	/* Get the fd info for process. */
1347 	fdp = p->p_fd;
1348 
1349 	/*
1350 	 * Range check file descriptor.
1351 	 */
1352 	fd = aiocbe->uaiocb.aio_fildes;
1353 	if (fd >= fdp->fd_nfiles) {
1354 		zfree(aiocb_zone, aiocbe);
1355 		if (type == 0)
1356 			suword(&job->_aiocb_private.error, EBADF);
1357 		return EBADF;
1358 	}
1359 
1360 	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1361 	if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1362 	    0))) {
1363 		zfree(aiocb_zone, aiocbe);
1364 		if (type == 0)
1365 			suword(&job->_aiocb_private.error, EBADF);
1366 		return EBADF;
1367 	}
1368 
1369 	if (aiocbe->uaiocb.aio_offset == -1LL) {
1370 		zfree(aiocb_zone, aiocbe);
1371 		if (type == 0)
1372 			suword(&job->_aiocb_private.error, EINVAL);
1373 		return EINVAL;
1374 	}
1375 
1376 	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1377 	if (error) {
1378 		zfree(aiocb_zone, aiocbe);
1379 		if (type == 0)
1380 			suword(&job->_aiocb_private.error, EINVAL);
1381 		return error;
1382 	}
1383 
1384 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1385 	if (jobrefid == LONG_MAX)
1386 		jobrefid = 1;
1387 	else
1388 		jobrefid++;
1389 
1390 	if (opcode == LIO_NOP) {
1391 		zfree(aiocb_zone, aiocbe);
1392 		if (type == 0) {
1393 			suword(&job->_aiocb_private.error, 0);
1394 			suword(&job->_aiocb_private.status, 0);
1395 			suword(&job->_aiocb_private.kernelinfo, 0);
1396 		}
1397 		return 0;
1398 	}
1399 
1400 	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1401 		zfree(aiocb_zone, aiocbe);
1402 		if (type == 0) {
1403 			suword(&job->_aiocb_private.status, 0);
1404 			suword(&job->_aiocb_private.error, EINVAL);
1405 		}
1406 		return EINVAL;
1407 	}
1408 
1409 	fhold(fp);
1410 
1411 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1412 		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1413 		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1414 	}
1415 	else {
1416 		/*
1417 		 * This method for requesting kevent-based notification won't
1418 		 * work on the alpha, since we're passing in a pointer
1419 		 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
1420 		 * based method instead.
1421 		 */
1422 		struct kevent *kevp;
1423 
1424 		kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode;
1425 		if (kevp == NULL)
1426 			goto no_kqueue;
1427 
1428 		error = copyin(kevp, &kev, sizeof(kev));
1429 		if (error)
1430 			goto aqueue_fail;
1431 	}
1432 	if ((u_int)kev.ident >= fdp->fd_nfiles ||
1433 	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1434 	    (kq_fp->f_type != DTYPE_KQUEUE)) {
1435 		error = EBADF;
1436 		goto aqueue_fail;
1437 	}
1438 	kq = (struct kqueue *)kq_fp->f_data;
1439 	kev.ident = (uintptr_t)aiocbe;
1440 	kev.filter = EVFILT_AIO;
1441 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1442 	error = kqueue_register(kq, &kev, td);
1443 aqueue_fail:
1444 	if (error) {
1445 		zfree(aiocb_zone, aiocbe);
1446 		if (type == 0)
1447 			suword(&job->_aiocb_private.error, error);
1448 		goto done;
1449 	}
1450 no_kqueue:
1451 
1452 	suword(&job->_aiocb_private.error, EINPROGRESS);
1453 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1454 	aiocbe->userproc = p;
1455 	aiocbe->jobflags = 0;
1456 	aiocbe->lio = lj;
1457 	ki = p->p_aioinfo;
1458 
1459 	if (fp->f_type == DTYPE_SOCKET) {
1460 		/*
1461 		 * Alternate queueing for socket ops: Reach down into the
1462 		 * descriptor to get the socket data.  Then check to see if the
1463 		 * socket is ready to be read or written (based on the requested
1464 		 * operation).
1465 		 *
1466 		 * If it is not ready for I/O, then queue the aiocbe on the
1467 		 * socket, and set the flags so we get a call when sbnotify()
1468 		 * happens.
1469 		 */
1470 		so = (struct socket *)fp->f_data;
1471 		s = splnet();
1472 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1473 		    LIO_WRITE) && (!sowriteable(so)))) {
1474 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1475 			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1476 			if (opcode == LIO_READ)
1477 				so->so_rcv.sb_flags |= SB_AIO;
1478 			else
1479 				so->so_snd.sb_flags |= SB_AIO;
1480 			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1481 			ki->kaio_queue_count++;
1482 			num_queue_count++;
1483 			splx(s);
1484 			error = 0;
1485 			goto done;
1486 		}
1487 		splx(s);
1488 	}
1489 
1490 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1491 		goto done;
1492 	if (error > 0) {
1493 		suword(&job->_aiocb_private.status, 0);
1494 		aiocbe->uaiocb._aiocb_private.error = error;
1495 		suword(&job->_aiocb_private.error, error);
1496 		goto done;
1497 	}
1498 
1499 	/* No buffer for daemon I/O. */
1500 	aiocbe->bp = NULL;
1501 
1502 	ki->kaio_queue_count++;
1503 	if (lj)
1504 		lj->lioj_queue_count++;
1505 	s = splnet();
1506 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1507 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1508 	splx(s);
1509 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1510 
1511 	num_queue_count++;
1512 	error = 0;
1513 
1514 	/*
1515 	 * If we don't have a free AIO process, and we are below our quota, then
1516 	 * start one.  Otherwise, depend on the subsequent I/O completions to
1517 	 * pick up this job.  If we don't successfully create the new process
1518 	 * (thread) due to resource issues, we return an error for now (EAGAIN),
1519 	 * which is likely not the correct thing to do.
1520 	 */
1521 retryproc:
1522 	s = splnet();
1523 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1524 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1525 		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1526 		aiop->aiothreadflags &= ~AIOP_FREE;
1527 		wakeup(aiop->aiothread);
1528 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1529 	    ((ki->kaio_active_count + num_aio_resv_start) <
1530 	    ki->kaio_maxactive_count)) {
1531 		num_aio_resv_start++;
1532 		if ((error = aio_newproc()) == 0) {
1533 			num_aio_resv_start--;
1534 			td->td_retval[0] = 0;
1535 			goto retryproc;
1536 		}
1537 		num_aio_resv_start--;
1538 	}
1539 	splx(s);
1540 done:
1541 	fdrop(fp, td);
1542 	return error;
1543 }
1544 
1545 /*
1546  * This routine queues an AIO request, checking for quotas.
1547  */
1548 static int
1549 aio_aqueue(struct thread *td, struct aiocb *job, int type)
1550 {
1551 	struct proc *p = td->td_proc;
1552 	struct kaioinfo *ki;
1553 
1554 	if (p->p_aioinfo == NULL)
1555 		aio_init_aioinfo(p);
1556 
1557 	if (num_queue_count >= max_queue_count)
1558 		return EAGAIN;
1559 
1560 	ki = p->p_aioinfo;
1561 	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1562 		return EAGAIN;
1563 
1564 	return _aio_aqueue(td, job, NULL, type);
1565 }
1566 
1567 /*
1568  * Support the aio_return system call; as a side effect, kernel resources are
1569  * released.
1570  */
1571 int
1572 aio_return(struct thread *td, struct aio_return_args *uap)
1573 {
1574 	struct proc *p = td->td_proc;
1575 	int s;
1576 	int jobref;
1577 	struct aiocblist *cb, *ncb;
1578 	struct aiocb *ujob;
1579 	struct kaioinfo *ki;
1580 
1581 	ki = p->p_aioinfo;
1582 	if (ki == NULL)
1583 		return EINVAL;
1584 
1585 	ujob = uap->aiocbp;
1586 
1587 	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1588 	if (jobref == -1 || jobref == 0)
1589 		return EINVAL;
1590 
1591 	s = splnet();
1592 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1593 	    plist)) {
1594 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1595 		    jobref) {
1596 			splx(s);
1597 			if (ujob == cb->uuaiocb) {
1598 				td->td_retval[0] =
1599 				    cb->uaiocb._aiocb_private.status;
1600 			} else
1601 				td->td_retval[0] = EFAULT;
1602 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1603 				p->p_stats->p_ru.ru_oublock +=
1604 				    cb->outputcharge;
1605 				cb->outputcharge = 0;
1606 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1607 				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1608 				cb->inputcharge = 0;
1609 			}
1610 			aio_free_entry(cb);
1611 			return 0;
1612 		}
1613 	}
1614 	splx(s);
1615 
1616 	s = splbio();
1617 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1618 		ncb = TAILQ_NEXT(cb, plist);
1619 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1620 		    == jobref) {
1621 			splx(s);
1622 			if (ujob == cb->uuaiocb) {
1623 				td->td_retval[0] =
1624 				    cb->uaiocb._aiocb_private.status;
1625 			} else
1626 				td->td_retval[0] = EFAULT;
1627 			aio_free_entry(cb);
1628 			return 0;
1629 		}
1630 	}
1631 	splx(s);
1632 
1633 	return (EINVAL);
1634 }
1635 
1636 /*
1637  * Allow a process to wake up when any of the I/O requests are completed.
1638  */
1639 int
1640 aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1641 {
1642 	struct proc *p = td->td_proc;
1643 	struct timeval atv;
1644 	struct timespec ts;
1645 	struct aiocb *const *cbptr, *cbp;
1646 	struct kaioinfo *ki;
1647 	struct aiocblist *cb;
1648 	int i;
1649 	int njoblist;
1650 	int error, s, timo;
1651 	int *ijoblist;
1652 	struct aiocb **ujoblist;
1653 
1654 	if (uap->nent > AIO_LISTIO_MAX)
1655 		return EINVAL;
1656 
1657 	timo = 0;
1658 	if (uap->timeout) {
1659 		/* Get timespec struct. */
1660 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1661 			return error;
1662 
1663 		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1664 			return (EINVAL);
1665 
1666 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
1667 		if (itimerfix(&atv))
1668 			return (EINVAL);
1669 		timo = tvtohz(&atv);
1670 	}
1671 
1672 	ki = p->p_aioinfo;
1673 	if (ki == NULL)
1674 		return EAGAIN;
1675 
1676 	njoblist = 0;
1677 	ijoblist = zalloc(aiol_zone);
1678 	ujoblist = zalloc(aiol_zone);
1679 	cbptr = uap->aiocbp;
1680 
1681 	for (i = 0; i < uap->nent; i++) {
1682 		cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1683 		if (cbp == 0)
1684 			continue;
1685 		ujoblist[njoblist] = cbp;
1686 		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1687 		njoblist++;
1688 	}
1689 
1690 	if (njoblist == 0) {
1691 		zfree(aiol_zone, ijoblist);
1692 		zfree(aiol_zone, ujoblist);
1693 		return 0;
1694 	}
1695 
1696 	error = 0;
1697 	for (;;) {
1698 		for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb =
1699 		    TAILQ_NEXT(cb, plist)) {
1700 			for (i = 0; i < njoblist; i++) {
1701 				if (((intptr_t)
1702 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1703 				    ijoblist[i]) {
1704 					if (ujoblist[i] != cb->uuaiocb)
1705 						error = EINVAL;
1706 					zfree(aiol_zone, ijoblist);
1707 					zfree(aiol_zone, ujoblist);
1708 					return error;
1709 				}
1710 			}
1711 		}
1712 
1713 		s = splbio();
1714 		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1715 		    TAILQ_NEXT(cb, plist)) {
1716 			for (i = 0; i < njoblist; i++) {
1717 				if (((intptr_t)
1718 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1719 				    ijoblist[i]) {
1720 					splx(s);
1721 					if (ujoblist[i] != cb->uuaiocb)
1722 						error = EINVAL;
1723 					zfree(aiol_zone, ijoblist);
1724 					zfree(aiol_zone, ujoblist);
1725 					return error;
1726 				}
1727 			}
1728 		}
1729 
1730 		ki->kaio_flags |= KAIO_WAKEUP;
1731 		error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1732 		splx(s);
1733 
1734 		if (error == ERESTART || error == EINTR) {
1735 			zfree(aiol_zone, ijoblist);
1736 			zfree(aiol_zone, ujoblist);
1737 			return EINTR;
1738 		} else if (error == EWOULDBLOCK) {
1739 			zfree(aiol_zone, ijoblist);
1740 			zfree(aiol_zone, ujoblist);
1741 			return EAGAIN;
1742 		}
1743 	}
1744 
1745 /* NOTREACHED */
1746 	return EINVAL;
1747 }
1748 
1749 /*
1750  * aio_cancel cancels any non-physio aio operations not currently in
1751  * progress.
1752  */
1753 int
1754 aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1755 {
1756 	struct proc *p = td->td_proc;
1757 	struct kaioinfo *ki;
1758 	struct aiocblist *cbe, *cbn;
1759 	struct file *fp;
1760 	struct filedesc *fdp;
1761 	struct socket *so;
1762 	struct proc *po;
1763 	int s,error;
1764 	int cancelled=0;
1765 	int notcancelled=0;
1766 	struct vnode *vp;
1767 
1768 	fdp = p->p_fd;
1769 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
1770 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1771 		return (EBADF);
1772 
1773         if (fp->f_type == DTYPE_VNODE) {
1774 		vp = (struct vnode *)fp->f_data;
1775 
1776 		if (vn_isdisk(vp,&error)) {
1777 			td->td_retval[0] = AIO_NOTCANCELED;
1778         	        return 0;
1779 		}
1780 	} else if (fp->f_type == DTYPE_SOCKET) {
1781 		so = (struct socket *)fp->f_data;
1782 
1783 		s = splnet();
1784 
1785 		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1786 			cbn = TAILQ_NEXT(cbe, list);
1787 			if ((uap->aiocbp == NULL) ||
1788 				(uap->aiocbp == cbe->uuaiocb) ) {
1789 				po = cbe->userproc;
1790 				ki = po->p_aioinfo;
1791 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1792 				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1793 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1794 				if (ki->kaio_flags & KAIO_WAKEUP) {
1795 					wakeup(po);
1796 				}
1797 				cbe->jobstate = JOBST_JOBFINISHED;
1798 				cbe->uaiocb._aiocb_private.status=-1;
1799 				cbe->uaiocb._aiocb_private.error=ECANCELED;
1800 				cancelled++;
1801 /* XXX cancelled, knote? */
1802 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1803 				    SIGEV_SIGNAL) {
1804 					PROC_LOCK(cbe->userproc);
1805 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1806 					PROC_UNLOCK(cbe->userproc);
1807 				}
1808 				if (uap->aiocbp)
1809 					break;
1810 			}
1811 		}
1812 		splx(s);
1813 
1814 		if ((cancelled) && (uap->aiocbp)) {
1815 			td->td_retval[0] = AIO_CANCELED;
1816 			return 0;
1817 		}
1818 	}
1819 	ki=p->p_aioinfo;
1820 	s = splnet();
1821 
1822 	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1823 		cbn = TAILQ_NEXT(cbe, plist);
1824 
1825 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1826 		    ((uap->aiocbp == NULL ) ||
1827 		     (uap->aiocbp == cbe->uuaiocb))) {
1828 
1829 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1830 				TAILQ_REMOVE(&aio_jobs, cbe, list);
1831                                 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1832                                 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1833                                     plist);
1834 				cancelled++;
1835 				ki->kaio_queue_finished_count++;
1836 				cbe->jobstate = JOBST_JOBFINISHED;
1837 				cbe->uaiocb._aiocb_private.status = -1;
1838 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1839 /* XXX cancelled, knote? */
1840 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1841 				    SIGEV_SIGNAL) {
1842 					PROC_LOCK(cbe->userproc);
1843 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1844 					PROC_UNLOCK(cbe->userproc);
1845 				}
1846 			} else {
1847 				notcancelled++;
1848 			}
1849 		}
1850 	}
1851 	splx(s);
1852 
1853 	if (notcancelled) {
1854 		td->td_retval[0] = AIO_NOTCANCELED;
1855 		return 0;
1856 	}
1857 	if (cancelled) {
1858 		td->td_retval[0] = AIO_CANCELED;
1859 		return 0;
1860 	}
1861 	td->td_retval[0] = AIO_ALLDONE;
1862 
1863 	return 0;
1864 }
1865 
1866 /*
1867  * aio_error is implemented at the kernel level for compatibility purposes only.
1868  * For a user mode async implementation, it would be best to do it in a userland
1869  * subroutine.
1870  */
1871 int
1872 aio_error(struct thread *td, struct aio_error_args *uap)
1873 {
1874 	struct proc *p = td->td_proc;
1875 	int s;
1876 	struct aiocblist *cb;
1877 	struct kaioinfo *ki;
1878 	int jobref;
1879 
1880 	ki = p->p_aioinfo;
1881 	if (ki == NULL)
1882 		return EINVAL;
1883 
1884 	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1885 	if ((jobref == -1) || (jobref == 0))
1886 		return EINVAL;
1887 
1888 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1889 	    plist)) {
1890 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1891 		    jobref) {
1892 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1893 			return 0;
1894 		}
1895 	}
1896 
1897 	s = splnet();
1898 
1899 	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1900 	    plist)) {
1901 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1902 		    jobref) {
1903 			td->td_retval[0] = EINPROGRESS;
1904 			splx(s);
1905 			return 0;
1906 		}
1907 	}
1908 
1909 	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1910 	    plist)) {
1911 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1912 		    jobref) {
1913 			td->td_retval[0] = EINPROGRESS;
1914 			splx(s);
1915 			return 0;
1916 		}
1917 	}
1918 	splx(s);
1919 
1920 	s = splbio();
1921 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1922 	    plist)) {
1923 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1924 		    jobref) {
1925 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1926 			splx(s);
1927 			return 0;
1928 		}
1929 	}
1930 
1931 	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1932 	    plist)) {
1933 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1934 		    jobref) {
1935 			td->td_retval[0] = EINPROGRESS;
1936 			splx(s);
1937 			return 0;
1938 		}
1939 	}
1940 	splx(s);
1941 
1942 #if 0
1943 	/*
1944 	 * Hack for lio.
1945 	 */
1946 	status = fuword(&uap->aiocbp->_aiocb_private.status);
1947 	if (status == -1)
1948 		return fuword(&uap->aiocbp->_aiocb_private.error);
1949 #endif
1950 	return EINVAL;
1951 }
1952 
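/*
 * aio_read() and aio_write() are thin wrappers that queue the request
 * through the common aio_aqueue() path with the appropriate LIO opcode.
 */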
1953 int
1954 aio_read(struct thread *td, struct aio_read_args *uap)
1955 {
1956 
1957 	return aio_aqueue(td, uap->aiocbp, LIO_READ);
1958 }
1959 
1960 int
1961 aio_write(struct thread *td, struct aio_write_args *uap)
1962 {
1963 
1964 	return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
1965 }
1966 
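/*
 * lio_listio: queue a list of AIO requests with a single system call.
 * With LIO_WAIT the caller sleeps until every queued request has
 * completed; with LIO_NOWAIT the call returns immediately and an
 * optional sigevent is posted once the whole job finishes.  An
 * illustrative userland call (not taken from this file) might look like:
 *
 *	struct aiocb *list[2] = { &acb0, &acb1 };
 *	lio_listio(LIO_WAIT, list, 2, NULL);
 */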
1967 int
1968 lio_listio(struct thread *td, struct lio_listio_args *uap)
1969 {
1970 	struct proc *p = td->td_proc;
1971 	int nent, nentqueued;
1972 	struct aiocb *iocb, * const *cbptr;
1973 	struct aiocblist *cb;
1974 	struct kaioinfo *ki;
1975 	struct aio_liojob *lj;
1976 	int error, runningcode;
1977 	int nerror;
1978 	int i;
1979 	int s;
1980 
1981 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1982 		return EINVAL;
1983 
1984 	nent = uap->nent;
1985 	if (nent > AIO_LISTIO_MAX)
1986 		return EINVAL;
1987 
1988 	if (p->p_aioinfo == NULL)
1989 		aio_init_aioinfo(p);
1990 
1991 	if ((nent + num_queue_count) > max_queue_count)
1992 		return EAGAIN;
1993 
1994 	ki = p->p_aioinfo;
1995 	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1996 		return EAGAIN;
1997 
1998 	lj = zalloc(aiolio_zone);
1999 	if (!lj)
2000 		return EAGAIN;
2001 
2002 	lj->lioj_flags = 0;
2003 	lj->lioj_buffer_count = 0;
2004 	lj->lioj_buffer_finished_count = 0;
2005 	lj->lioj_queue_count = 0;
2006 	lj->lioj_queue_finished_count = 0;
2007 	lj->lioj_ki = ki;
2008 
2009 	/*
2010 	 * Set up the completion signal (LIO_NOWAIT only).
2011 	 */
2012 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2013 		error = copyin(uap->sig, &lj->lioj_signal,
2014 			       sizeof(lj->lioj_signal));
2015 		if (error) {
2016 			zfree(aiolio_zone, lj);
2017 			return error;
2018 		}
2019 		if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2020 			zfree(aiolio_zone, lj);
2021 			return EINVAL;
2022 		}
2023 		lj->lioj_flags |= LIOJ_SIGNAL;
2024 		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
2025 	} else
2026 		lj->lioj_flags &= ~LIOJ_SIGNAL;
2027 
2028 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2029 	/*
2030 	 * Get pointers to the list of I/O requests.
2031 	 */
2032 	nerror = 0;
2033 	nentqueued = 0;
2034 	cbptr = uap->acb_list;
2035 	for (i = 0; i < uap->nent; i++) {
2036 		iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2037 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
2038 			error = _aio_aqueue(td, iocb, lj, 0);
2039 			if (error == 0)
2040 				nentqueued++;
2041 			else
2042 				nerror++;
2043 		}
2044 	}
2045 
2046 	/*
2047 	 * If we haven't queued any, then just return.
2048 	 */
2049 	if (nentqueued == 0)
2050 		return 0;
2051 
2052 	/*
2053 	 * Calculate the appropriate error return.
2054 	 */
2055 	runningcode = 0;
2056 	if (nerror)
2057 		runningcode = EIO;
2058 
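	/*
	 * For LIO_WAIT, repeatedly scan the per-process done queues until
	 * every request that was queued above has completed, sleeping
	 * between passes while work is still outstanding.
	 */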
2059 	if (uap->mode == LIO_WAIT) {
2060 		int command, found, jobref;
2061 
2062 		for (;;) {
2063 			found = 0;
2064 			for (i = 0; i < uap->nent; i++) {
2065 				/*
2066 				 * Fetch the control block pointer from user
2067 				 * space.
2068 				 */
2069 				iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2070 				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2071 				    == 0))
2072 					continue;
2073 
2074 				/*
2075 				 * Fetch the associated command from user space.
2076 				 */
2077 				command = fuword(&iocb->aio_lio_opcode);
2078 				if (command == LIO_NOP) {
2079 					found++;
2080 					continue;
2081 				}
2082 
2083 				jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2084 
2085 				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2086 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2087 					    == jobref) {
2088 						if (cb->uaiocb.aio_lio_opcode
2089 						    == LIO_WRITE) {
2090 							p->p_stats->p_ru.ru_oublock
2091 							    +=
2092 							    cb->outputcharge;
2093 							cb->outputcharge = 0;
2094 						} else if (cb->uaiocb.aio_lio_opcode
2095 						    == LIO_READ) {
2096 							p->p_stats->p_ru.ru_inblock
2097 							    += cb->inputcharge;
2098 							cb->inputcharge = 0;
2099 						}
2100 						found++;
2101 						break;
2102 					}
2103 				}
2104 
2105 				s = splbio();
2106 				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2107 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2108 					    == jobref) {
2109 						found++;
2110 						break;
2111 					}
2112 				}
2113 				splx(s);
2114 			}
2115 
2116 			/*
2117 			 * If all I/Os have been disposed of, then we can
2118 			 * return.
2119 			 */
2120 			if (found == nentqueued)
2121 				return runningcode;
2122 
2123 			ki->kaio_flags |= KAIO_WAKEUP;
2124 			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2125 
2126 			if (error == EINTR)
2127 				return EINTR;
2128 			else if (error == EWOULDBLOCK)
2129 				return EAGAIN;
2130 		}
2131 	}
2132 
2133 	return runningcode;
2134 }
2135 
2136 /*
2137  * This is a weird hack so that we can post a signal.  It is safe to do so from
2138  * a timeout routine, but *not* from an interrupt routine.
2139  */
2140 static void
2141 process_signal(void *aioj)
2142 {
2143 	struct aiocblist *aiocbe = aioj;
2144 	struct aio_liojob *lj = aiocbe->lio;
2145 	struct aiocb *cb = &aiocbe->uaiocb;
2146 
2147 	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2148 		(lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2149 		PROC_LOCK(lj->lioj_ki->kaio_p);
2150 		psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2151 		PROC_UNLOCK(lj->lioj_ki->kaio_p);
2152 		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2153 	}
2154 
2155 	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2156 		PROC_LOCK(aiocbe->userproc);
2157 		psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2158 		PROC_UNLOCK(aiocbe->userproc);
2159 	}
2160 }
2161 
2162 /*
2163  * Interrupt completion handler for physio; performs the necessary process
2164  * wakeups and posts any requested signals.
2165  */
2166 static void
2167 aio_physwakeup(struct buf *bp)
2168 {
2169 	struct aiocblist *aiocbe;
2170 	struct proc *p;
2171 	struct kaioinfo *ki;
2172 	struct aio_liojob *lj;
2173 
2174 	wakeup(bp);
2175 
2176 	aiocbe = (struct aiocblist *)bp->b_spc;
2177 	if (aiocbe) {
2178 		p = bp->b_caller1;
2179 
2180 		aiocbe->jobstate = JOBST_JOBBFINISHED;
2181 		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2182 		aiocbe->uaiocb._aiocb_private.error = 0;
2183 		aiocbe->jobflags |= AIOCBLIST_DONE;
2184 
2185 		if (bp->b_ioflags & BIO_ERROR)
2186 			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2187 
2188 		lj = aiocbe->lio;
2189 		if (lj) {
2190 			lj->lioj_buffer_finished_count++;
2191 
2192 			/*
2193 			 * Wake up / signal if all of the interrupt jobs are done.
2194 			 */
2195 			if (lj->lioj_buffer_finished_count ==
2196 			    lj->lioj_buffer_count) {
2197 				/*
2198 				 * Post a signal if it is called for.
2199 				 */
2200 				if ((lj->lioj_flags &
2201 				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2202 				    LIOJ_SIGNAL) {
2203 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2204 					aiocbe->timeouthandle =
2205 						timeout(process_signal,
2206 							aiocbe, 0);
2207 				}
2208 			}
2209 		}
2210 
2211 		ki = p->p_aioinfo;
2212 		if (ki) {
2213 			ki->kaio_buffer_finished_count++;
2214 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2215 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2216 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2217 
2218 			KNOTE(&aiocbe->klist, 0);
2219 			/* Do the wakeup. */
2220 			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2221 				ki->kaio_flags &= ~KAIO_WAKEUP;
2222 				wakeup(p);
2223 			}
2224 		}
2225 
2226 		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2227 			aiocbe->timeouthandle =
2228 				timeout(process_signal, aiocbe, 0);
2229 	}
2230 }
2231 
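/*
 * aio_waitcomplete: wait for the next AIO request belonging to this
 * process to complete (subject to an optional timeout), store the
 * userland aiocb pointer of the completed request through uap->aiocbp,
 * and return its error status; the I/O completion status is returned
 * in td_retval[0].
 */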
2232 int
2233 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2234 {
2235 	struct proc *p = td->td_proc;
2236 	struct timeval atv;
2237 	struct timespec ts;
2238 	struct aiocb **cbptr;
2239 	struct kaioinfo *ki;
2240 	struct aiocblist *cb = NULL;
2241 	int error, s, timo;
2242 
2243 	suword(uap->aiocbp, (long)NULL);
2244 
2245 	timo = 0;
2246 	if (uap->timeout) {
2247 		/* Get timespec struct. */
2248 		error = copyin(uap->timeout, &ts, sizeof(ts));
2249 		if (error)
2250 			return error;
2251 
2252 		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2253 			return (EINVAL);
2254 
2255 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
2256 		if (itimerfix(&atv))
2257 			return (EINVAL);
2258 		timo = tvtohz(&atv);
2259 	}
2260 
2261 	ki = p->p_aioinfo;
2262 	if (ki == NULL)
2263 		return EAGAIN;
2264 
2265 	cbptr = uap->aiocbp;
2266 
2267 	for (;;) {
2268 		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2269 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2270 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2271 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2272 				p->p_stats->p_ru.ru_oublock += cb->outputcharge;
2273 				cb->outputcharge = 0;
2274 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2275 				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2276 				cb->inputcharge = 0;
2277 			}
2278 			error = cb->uaiocb._aiocb_private.error;
2279 			aio_free_entry(cb);
2280 			return error;
2281 		}
2282 
2283 		s = splbio();
2284 		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0) {
2285 			splx(s);
2286 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2287 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2288 			error = cb->uaiocb._aiocb_private.error;
2289 			aio_free_entry(cb);
2290 			return error;
2291 		}
2292 		ki->kaio_flags |= KAIO_WAKEUP;
2293 		error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2294 		splx(s);
2295 
2296 		if (error == ERESTART)
2297 			return EINTR;
2298 		else if (error < 0)
2299 			return error;
2300 		else if (error == EINTR)
2301 			return EINTR;
2302 		else if (error == EWOULDBLOCK)
2303 			return EAGAIN;
2304 	}
2305 }
2306 
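/* kqueue filter routines for AIO completion events. */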
2307 static int
2308 filt_aioattach(struct knote *kn)
2309 {
2310 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2311 
2312 	/*
2313 	 * The aiocbe pointer must be validated before using it, so
2314 	 * registration is restricted to the kernel; the user cannot
2315 	 * set EV_FLAG1.
2316 	 */
2317 	if ((kn->kn_flags & EV_FLAG1) == 0)
2318 		return (EPERM);
2319 	kn->kn_flags &= ~EV_FLAG1;
2320 
2321 	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2322 
2323 	return (0);
2324 }
2325 
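/* Detach a knote from the aiocb it was attached to. */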
2326 static void
2327 filt_aiodetach(struct knote *kn)
2328 {
2329 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2330 
2331 	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2332 }
2333 
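/*
 * Event filter: report the job's error status in kn_data and fire once
 * the job has reached a finished state.
 */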
2334 /*ARGSUSED*/
2335 static int
2336 filt_aio(struct knote *kn, long hint)
2337 {
2338 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2339 
2340 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2341 	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2342 	    aiocbe->jobstate != JOBST_JOBBFINISHED)
2343 		return (0);
2344 	kn->kn_flags |= EV_EOF;
2345 	return (1);
2346 }
2347