xref: /freebsd/sys/kern/vfs_aio.c (revision a3e8fd0b7f663db7eafff527d5c3ca3bcfa8a537)
1 /*
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  *
16  * $FreeBSD$
17  */
18 
19 /*
20  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21  */
22 
23 #include <sys/param.h>
24 #include <sys/systm.h>
25 #include <sys/malloc.h>
26 #include <sys/bio.h>
27 #include <sys/buf.h>
28 #include <sys/sysproto.h>
29 #include <sys/filedesc.h>
30 #include <sys/kernel.h>
31 #include <sys/kthread.h>
32 #include <sys/fcntl.h>
33 #include <sys/file.h>
34 #include <sys/lock.h>
35 #include <sys/mutex.h>
36 #include <sys/unistd.h>
37 #include <sys/proc.h>
38 #include <sys/resourcevar.h>
39 #include <sys/signalvar.h>
40 #include <sys/protosw.h>
41 #include <sys/socketvar.h>
42 #include <sys/syscall.h>
43 #include <sys/sysent.h>
44 #include <sys/sysctl.h>
45 #include <sys/sx.h>
46 #include <sys/vnode.h>
47 #include <sys/conf.h>
48 #include <sys/event.h>
49 
50 #include <vm/vm.h>
51 #include <vm/vm_extern.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_map.h>
54 #include <vm/uma.h>
55 #include <sys/aio.h>
56 
57 #include <machine/limits.h>
58 
59 #include "opt_vfs_aio.h"
60 
61 /*
62  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
63  * overflow.
64  */
65 static	long jobrefid;
66 
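/*
 * Job states: JOBST_JOBQGLOBAL marks a job queued for a daemon (it is also
 * used for jobs parked on a socket's queue), JOBST_JOBRUNNING marks a job a
 * daemon is currently servicing, and JOBST_JOBFINISHED marks a job on the
 * per-process done queue.  JOBST_JOBQBUF and JOBST_JOBBFINISHED are the
 * physio equivalents for in-flight and completed buf transfers.
 */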
67 #define JOBST_NULL		0x0
68 #define JOBST_JOBQGLOBAL	0x2
69 #define JOBST_JOBRUNNING	0x3
70 #define JOBST_JOBFINISHED	0x4
71 #define	JOBST_JOBQBUF		0x5
72 #define	JOBST_JOBBFINISHED	0x6
73 
74 #ifndef MAX_AIO_PER_PROC
75 #define MAX_AIO_PER_PROC	32
76 #endif
77 
78 #ifndef MAX_AIO_QUEUE_PER_PROC
79 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
80 #endif
81 
82 #ifndef MAX_AIO_PROCS
83 #define MAX_AIO_PROCS		32
84 #endif
85 
86 #ifndef MAX_AIO_QUEUE
87 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
88 #endif
89 
90 #ifndef TARGET_AIO_PROCS
91 #define TARGET_AIO_PROCS	4
92 #endif
93 
94 #ifndef MAX_BUF_AIO
95 #define MAX_BUF_AIO		16
96 #endif
97 
98 #ifndef AIOD_TIMEOUT_DEFAULT
99 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
100 #endif
101 
102 #ifndef AIOD_LIFETIME_DEFAULT
103 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
104 #endif
105 
106 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
107 
108 static int max_aio_procs = MAX_AIO_PROCS;
109 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
110 	CTLFLAG_RW, &max_aio_procs, 0,
111 	"Maximum number of kernel threads to use for handling async IO");
112 
113 static int num_aio_procs = 0;
114 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
115 	CTLFLAG_RD, &num_aio_procs, 0,
116 	"Number of presently active kernel threads for async IO");
117 
118 /*
119  * The code will adjust the actual number of AIO processes towards this
120  * number when it gets a chance.
121  */
122 static int target_aio_procs = TARGET_AIO_PROCS;
123 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
124 	0, "Preferred number of ready kernel threads for async IO");
125 
126 static int max_queue_count = MAX_AIO_QUEUE;
127 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
128     "Maximum number of aio requests to queue, globally");
129 
130 static int num_queue_count = 0;
131 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
132     "Number of queued aio requests");
133 
134 static int num_buf_aio = 0;
135 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
136     "Number of aio requests presently handled by the buf subsystem");
137 
138 /* Number of async I/O threads in the process of being started */
139 /* XXX This should be local to _aio_aqueue() */
140 static int num_aio_resv_start = 0;
141 
142 static int aiod_timeout;
143 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
144     "Timeout value for synchronous aio operations");
145 
146 static int aiod_lifetime;
147 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
148     "Maximum lifetime for idle aiod");
149 
150 static int unloadable = 0;
151 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
152     "Allow unload of aio (not recommended)");
153 
154 
155 static int max_aio_per_proc = MAX_AIO_PER_PROC;
156 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
157     0, "Maximum active aio requests per process (stored in the process)");
158 
159 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
160 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
161     &max_aio_queue_per_proc, 0,
162     "Maximum queued aio requests per process (stored in the process)");
163 
164 static int max_buf_aio = MAX_BUF_AIO;
165 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
166     "Maximum buf aio requests per process (stored in the process)");
167 
168 struct aiocblist {
169         TAILQ_ENTRY(aiocblist) list;	/* List of jobs */
170         TAILQ_ENTRY(aiocblist) plist;	/* List of jobs for proc */
171         int	jobflags;
172         int	jobstate;
173 	int	inputcharge;
174 	int	outputcharge;
175 	struct	callout_handle timeouthandle;
176         struct	buf *bp;		/* Buffer pointer */
177         struct	proc *userproc;		/* User process */ /* Not td! */
178         struct	file *fd_file;		/* Pointer to file structure */
179         struct	aio_liojob *lio;	/* Optional lio job */
180         struct	aiocb *uuaiocb;		/* Pointer in userspace of aiocb */
181 	struct	klist klist;		/* list of knotes */
182         struct	aiocb uaiocb;		/* Kernel I/O control block */
183 };
184 
185 /* jobflags */
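/*
 * AIOCBLIST_RUNDOWN is set by aio_free_entry() while it waits for a running
 * job; the servicing daemon wakes the waiter and clears it.  AIOCBLIST_DONE
 * marks a request whose I/O has completed.
 */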
186 #define AIOCBLIST_RUNDOWN       0x4
187 #define AIOCBLIST_DONE          0x10
188 
189 /*
190  * AIO process info
191  */
192 #define AIOP_FREE	0x1			/* proc on free queue */
193 #define AIOP_SCHED	0x2			/* proc explicitly scheduled */
194 
195 struct aiothreadlist {
196 	int aiothreadflags;			/* AIO proc flags */
197 	TAILQ_ENTRY(aiothreadlist) list;	/* List of processes */
198 	struct thread *aiothread;		/* The AIO thread */
199 };
200 
201 /*
202  * data-structure for lio signal management
203  */
204 struct aio_liojob {
205 	int	lioj_flags;
206 	int	lioj_buffer_count;
207 	int	lioj_buffer_finished_count;
208 	int	lioj_queue_count;
209 	int	lioj_queue_finished_count;
210 	struct	sigevent lioj_signal;	/* signal on all I/O done */
211 	TAILQ_ENTRY(aio_liojob) lioj_list;
212 	struct	kaioinfo *lioj_ki;
213 };
214 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
215 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
216 
217 /*
218  * per process aio data structure
219  */
220 struct kaioinfo {
221 	int	kaio_flags;		/* per process kaio flags */
222 	int	kaio_maxactive_count;	/* maximum number of AIOs */
223 	int	kaio_active_count;	/* number of currently used AIOs */
224 	int	kaio_qallowed_count;	/* maximum size of AIO queue */
225 	int	kaio_queue_count;	/* size of AIO queue */
226 	int	kaio_ballowed_count;	/* maximum number of buffers */
227 	int	kaio_queue_finished_count; /* number of daemon jobs finished */
228 	int	kaio_buffer_count;	/* number of physio buffers */
229 	int	kaio_buffer_finished_count; /* count of I/O done */
230 	struct 	proc *kaio_p;		/* process that uses this kaio block */
231 	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
232 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
233 	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
234 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
235 	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
236 	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
237 };
238 
239 #define KAIO_RUNDOWN	0x1	/* process is being run down */
240 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
241 
242 static TAILQ_HEAD(,aiothreadlist) aio_activeproc;	/* Active daemons */
243 static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* Idle daemons */
244 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
245 static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
246 
247 static void	aio_init_aioinfo(struct proc *p);
248 static void	aio_onceonly(void);
249 static int	aio_free_entry(struct aiocblist *aiocbe);
250 static void	aio_process(struct aiocblist *aiocbe);
251 static int	aio_newproc(void);
252 static int	aio_aqueue(struct thread *td, struct aiocb *job, int type);
253 static void	aio_physwakeup(struct buf *bp);
254 static void	aio_proc_rundown(struct proc *p);
255 static int	aio_fphysio(struct aiocblist *aiocbe);
256 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
257 static void	aio_daemon(void *uproc);
258 static void	aio_swake_cb(struct socket *, struct sockbuf *);
259 static int	aio_unload(void);
260 static void	process_signal(void *aioj);
261 static int	filt_aioattach(struct knote *kn);
262 static void	filt_aiodetach(struct knote *kn);
263 static int	filt_aio(struct knote *kn, long hint);
264 
265 /*
266  * Zones for:
267  * 	kaio	Per process async io info
268  *	aiop	async io thread data
269  *	aiocb	async io jobs
270  *	aiol	list io job pointer - internal to aio_suspend XXX
271  *	aiolio	list io jobs
272  */
273 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
274 
275 /* kqueue filters for aio */
276 static struct filterops aio_filtops =
277 	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
278 
279 /*
280  * Main operations function for use as a kernel module.
281  */
282 static int
283 aio_modload(struct module *module, int cmd, void *arg)
284 {
285 	int error = 0;
286 
287 	switch (cmd) {
288 	case MOD_LOAD:
289 		aio_onceonly();
290 		break;
291 	case MOD_UNLOAD:
292 		error = aio_unload();
293 		break;
294 	case MOD_SHUTDOWN:
295 		break;
296 	default:
297 		error = EINVAL;
298 		break;
299 	}
300 	return (error);
301 }
302 
303 static moduledata_t aio_mod = {
304 	"aio",
305 	&aio_modload,
306 	NULL
307 };
308 
309 SYSCALL_MODULE_HELPER(aio_return);
310 SYSCALL_MODULE_HELPER(aio_suspend);
311 SYSCALL_MODULE_HELPER(aio_cancel);
312 SYSCALL_MODULE_HELPER(aio_error);
313 SYSCALL_MODULE_HELPER(aio_read);
314 SYSCALL_MODULE_HELPER(aio_write);
315 SYSCALL_MODULE_HELPER(aio_waitcomplete);
316 SYSCALL_MODULE_HELPER(lio_listio);
317 
318 DECLARE_MODULE(aio, aio_mod,
319 	SI_SUB_VFS, SI_ORDER_ANY);
320 MODULE_VERSION(aio, 1);
321 
322 /*
323  * Startup initialization
324  */
325 static void
326 aio_onceonly(void)
327 {
328 
329 	/* XXX: should probably just use so->callback */
330 	aio_swake = &aio_swake_cb;
331 	at_exit(aio_proc_rundown);
332 	at_exec(aio_proc_rundown);
333 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
334 	TAILQ_INIT(&aio_freeproc);
335 	TAILQ_INIT(&aio_activeproc);
336 	TAILQ_INIT(&aio_jobs);
337 	TAILQ_INIT(&aio_bufjobs);
338 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
339 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
340 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
341 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
342 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
343 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
344 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
345 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
346 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aio_liojob), NULL,
347 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
348 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
349 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
350 	jobrefid = 1;
351 }
352 
353 /*
354  * Callback for unload of AIO when used as a module.
355  */
356 static int
357 aio_unload(void)
358 {
359 
360 	/*
361 	 * XXX: no unloads by default; it's too dangerous.
362 	 * Perhaps we could do it if we locked out callers and then
363 	 * did an aio_proc_rundown() on each process.
364 	 */
365 	if (!unloadable)
366 		return (EOPNOTSUPP);
367 
368 	aio_swake = NULL;
369 	rm_at_exit(aio_proc_rundown);
370 	rm_at_exec(aio_proc_rundown);
371 	kqueue_del_filteropts(EVFILT_AIO);
372 	return (0);
373 }
374 
375 /*
376  * Init the per-process aioinfo structure.  The aioinfo limits are set
377  * per-process for user limit (resource) management.
378  */
379 static void
380 aio_init_aioinfo(struct proc *p)
381 {
382 	struct kaioinfo *ki;
383 	if (p->p_aioinfo == NULL) {
384 		ki = uma_zalloc(kaio_zone, M_WAITOK);
385 		p->p_aioinfo = ki;
386 		ki->kaio_flags = 0;
387 		ki->kaio_maxactive_count = max_aio_per_proc;
388 		ki->kaio_active_count = 0;
389 		ki->kaio_qallowed_count = max_aio_queue_per_proc;
390 		ki->kaio_queue_count = 0;
391 		ki->kaio_ballowed_count = max_buf_aio;
392 		ki->kaio_buffer_count = 0;
393 		ki->kaio_buffer_finished_count = 0;
394 		ki->kaio_p = p;
395 		TAILQ_INIT(&ki->kaio_jobdone);
396 		TAILQ_INIT(&ki->kaio_jobqueue);
397 		TAILQ_INIT(&ki->kaio_bufdone);
398 		TAILQ_INIT(&ki->kaio_bufqueue);
399 		TAILQ_INIT(&ki->kaio_liojoblist);
400 		TAILQ_INIT(&ki->kaio_sockqueue);
401 	}
402 
403 	while (num_aio_procs < target_aio_procs)
404 		aio_newproc();
405 }
406 
407 /*
408  * Free a job entry.  Wait for completion if it is currently active, but don't
409  * delay forever.  If we do delay, we return a flag telling the caller to
410  * restart the queue scan.
411  */
412 static int
413 aio_free_entry(struct aiocblist *aiocbe)
414 {
415 	struct kaioinfo *ki;
416 	struct aio_liojob *lj;
417 	struct proc *p;
418 	int error;
419 	int s;
420 
421 	if (aiocbe->jobstate == JOBST_NULL)
422 		panic("aio_free_entry: freeing already free job");
423 
424 	p = aiocbe->userproc;
425 	ki = p->p_aioinfo;
426 	lj = aiocbe->lio;
427 	if (ki == NULL)
428 		panic("aio_free_entry: missing p->p_aioinfo");
429 
430 	while (aiocbe->jobstate == JOBST_JOBRUNNING) {
431 		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
432 		tsleep(aiocbe, PRIBIO, "jobwai", 0);
433 	}
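	/*
	 * Jobs without a buf went through the daemon path and are charged
	 * against the queue counters; jobs with a buf went through physio
	 * and are charged against the buffer counters.
	 */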
434 	if (aiocbe->bp == NULL) {
435 		if (ki->kaio_queue_count <= 0)
436 			panic("aio_free_entry: process queue size <= 0");
437 		if (num_queue_count <= 0)
438 			panic("aio_free_entry: system wide queue size <= 0");
439 
440 		if (lj) {
441 			lj->lioj_queue_count--;
442 			if (aiocbe->jobflags & AIOCBLIST_DONE)
443 				lj->lioj_queue_finished_count--;
444 		}
445 		ki->kaio_queue_count--;
446 		if (aiocbe->jobflags & AIOCBLIST_DONE)
447 			ki->kaio_queue_finished_count--;
448 		num_queue_count--;
449 	} else {
450 		if (lj) {
451 			lj->lioj_buffer_count--;
452 			if (aiocbe->jobflags & AIOCBLIST_DONE)
453 				lj->lioj_buffer_finished_count--;
454 		}
455 		if (aiocbe->jobflags & AIOCBLIST_DONE)
456 			ki->kaio_buffer_finished_count--;
457 		ki->kaio_buffer_count--;
458 		num_buf_aio--;
459 	}
460 
461 	/* aiocbe is going away; we need to destroy any knotes. */
462 	/* XXXKSE The thread here is used to eventually find the
463 	 * owning process again, but it is also used to do a fo_close,
464 	 * and that requires a thread.  (Does it require the OWNING
465 	 * thread, or maybe just the running thread?)
466 	 * There is a semantic problem here...
467 	 */
468 	knote_remove(FIRST_THREAD_IN_PROC(p), &aiocbe->klist); /* XXXKSE */
469 
470 	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
471 	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
472 		ki->kaio_flags &= ~KAIO_WAKEUP;
473 		wakeup(p);
474 	}
475 
476 	if (aiocbe->jobstate == JOBST_JOBQBUF) {
477 		if ((error = aio_fphysio(aiocbe)) != 0)
478 			return error;
479 		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
480 			panic("aio_free_entry: invalid physio finish-up state");
481 		s = splbio();
482 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
483 		splx(s);
484 	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
485 		s = splnet();
486 		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
487 		TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
488 		splx(s);
489 	} else if (aiocbe->jobstate == JOBST_JOBFINISHED)
490 		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
491 	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
492 		s = splbio();
493 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
494 		splx(s);
495 		if (aiocbe->bp) {
496 			vunmapbuf(aiocbe->bp);
497 			relpbuf(aiocbe->bp, NULL);
498 			aiocbe->bp = NULL;
499 		}
500 	}
501 	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
502 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
503 		uma_zfree(aiolio_zone, lj);
504 	}
505 	aiocbe->jobstate = JOBST_NULL;
506 	untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
507 	fdrop(aiocbe->fd_file, curthread);
508 	uma_zfree(aiocb_zone, aiocbe);
509 	return 0;
510 }
511 
512 /*
513  * Rundown the jobs for a given process.
514  */
515 static void
516 aio_proc_rundown(struct proc *p)
517 {
518 	int s;
519 	struct kaioinfo *ki;
520 	struct aio_liojob *lj, *ljn;
521 	struct aiocblist *aiocbe, *aiocbn;
522 	struct file *fp;
523 	struct socket *so;
524 
525 	ki = p->p_aioinfo;
526 	if (ki == NULL)
527 		return;
528 
529 	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
530 	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
531 	    ki->kaio_buffer_finished_count)) {
532 		ki->kaio_flags |= KAIO_RUNDOWN;
533 		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
534 			break;
535 	}
536 
537 	/*
538 	 * Move any aio ops that are waiting on socket I/O to the normal job
539 	 * queues so they are cleaned up with any others.
540 	 */
541 	s = splnet();
542 	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
543 	    aiocbn) {
544 		aiocbn = TAILQ_NEXT(aiocbe, plist);
545 		fp = aiocbe->fd_file;
546 		if (fp != NULL) {
547 			so = (struct socket *)fp->f_data;
548 			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
549 			if (TAILQ_EMPTY(&so->so_aiojobq)) {
550 				so->so_snd.sb_flags &= ~SB_AIO;
551 				so->so_rcv.sb_flags &= ~SB_AIO;
552 			}
553 		}
554 		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
555 		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
556 		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
557 	}
558 	splx(s);
559 
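/*
 * aio_free_entry() returns nonzero when it could not free the entry
 * immediately (see the comment above that function); the queues may have
 * changed while we slept, so rescan from the head.
 */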
560 restart1:
561 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
562 		aiocbn = TAILQ_NEXT(aiocbe, plist);
563 		if (aio_free_entry(aiocbe))
564 			goto restart1;
565 	}
566 
567 restart2:
568 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
569 	    aiocbn) {
570 		aiocbn = TAILQ_NEXT(aiocbe, plist);
571 		if (aio_free_entry(aiocbe))
572 			goto restart2;
573 	}
574 
575 /*
576  * Note the liberal use of splbio here; the intent is to avoid holding splbio
577  * across long chains of I/O.  Probably unnecessary.
578  */
579 restart3:
580 	s = splbio();
581 	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
582 		ki->kaio_flags |= KAIO_WAKEUP;
583 		tsleep(p, PRIBIO, "aioprn", 0);
584 		splx(s);
585 		goto restart3;
586 	}
587 	splx(s);
588 
589 restart4:
590 	s = splbio();
591 	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
592 		aiocbn = TAILQ_NEXT(aiocbe, plist);
593 		if (aio_free_entry(aiocbe)) {
594 			splx(s);
595 			goto restart4;
596 		}
597 	}
598 	splx(s);
599 
600 	/*
601 	 * If we've slept, jobs might have moved from one queue to another.
602 	 * Retry rundown if we didn't manage to empty the queues.
603 	 */
604 	if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
605 	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
606 	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
607 	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
608 		goto restart1;
609 
610 	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
611 		ljn = TAILQ_NEXT(lj, lioj_list);
612 		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
613 		    0)) {
614 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
615 			uma_zfree(aiolio_zone, lj);
616 		} else {
617 #ifdef DIAGNOSTIC
618 			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
619 			    "QF:%d\n", lj->lioj_buffer_count,
620 			    lj->lioj_buffer_finished_count,
621 			    lj->lioj_queue_count,
622 			    lj->lioj_queue_finished_count);
623 #endif
624 		}
625 	}
626 
627 	uma_zfree(kaio_zone, ki);
628 	p->p_aioinfo = NULL;
629 }
630 
631 /*
632  * Select a job to run (called by an AIO daemon).
633  */
634 static struct aiocblist *
635 aio_selectjob(struct aiothreadlist *aiop)
636 {
637 	int s;
638 	struct aiocblist *aiocbe;
639 	struct kaioinfo *ki;
640 	struct proc *userp;
641 
642 	s = splnet();
643 	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
644 	    TAILQ_NEXT(aiocbe, list)) {
645 		userp = aiocbe->userproc;
646 		ki = userp->p_aioinfo;
647 
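		/*
		 * Skip jobs whose owning process already has its per-process
		 * limit of active requests in flight.
		 */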
648 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
649 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
650 			splx(s);
651 			return aiocbe;
652 		}
653 	}
654 	splx(s);
655 
656 	return NULL;
657 }
658 
659 /*
660  * The AIO processing activity.  This is the code that does the I/O request for
661  * the non-physio version of the operations.  The normal vn operations are used,
662  * and this code should work in all instances for every type of file, including
663  * pipes, sockets, fifos, and regular files.
664  */
665 static void
666 aio_process(struct aiocblist *aiocbe)
667 {
668 	struct thread *td;
669 	struct proc *mycp;
670 	struct aiocb *cb;
671 	struct file *fp;
672 	struct uio auio;
673 	struct iovec aiov;
674 	int cnt;
675 	int error;
676 	int oublock_st, oublock_end;
677 	int inblock_st, inblock_end;
678 
679 	td = curthread;
680 	mycp = td->td_proc;
681 	cb = &aiocbe->uaiocb;
682 	fp = aiocbe->fd_file;
683 
684 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
685 	aiov.iov_len = cb->aio_nbytes;
686 
687 	auio.uio_iov = &aiov;
688 	auio.uio_iovcnt = 1;
689 	auio.uio_offset = cb->aio_offset;
690 	auio.uio_resid = cb->aio_nbytes;
691 	cnt = cb->aio_nbytes;
692 	auio.uio_segflg = UIO_USERSPACE;
693 	auio.uio_td = td;
694 
695 	inblock_st = mycp->p_stats->p_ru.ru_inblock;
696 	oublock_st = mycp->p_stats->p_ru.ru_oublock;
697 	/*
698 	 * _aio_aqueue() acquires a reference to the file that is
699 	 * released in aio_free_entry().
700 	 */
701 	if (cb->aio_lio_opcode == LIO_READ) {
702 		auio.uio_rw = UIO_READ;
703 		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
704 	} else {
705 		auio.uio_rw = UIO_WRITE;
706 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
707 	}
708 	inblock_end = mycp->p_stats->p_ru.ru_inblock;
709 	oublock_end = mycp->p_stats->p_ru.ru_oublock;
710 
711 	aiocbe->inputcharge = inblock_end - inblock_st;
712 	aiocbe->outputcharge = oublock_end - oublock_st;
713 
714 	if ((error) && (auio.uio_resid != cnt)) {
715 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
716 			error = 0;
717 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
718 			PROC_LOCK(aiocbe->userproc);
719 			psignal(aiocbe->userproc, SIGPIPE);
720 			PROC_UNLOCK(aiocbe->userproc);
721 		}
722 	}
723 
724 	cnt -= auio.uio_resid;
725 	cb->_aiocb_private.error = error;
726 	cb->_aiocb_private.status = cnt;
727 }
728 
729 /*
730  * The AIO daemon, most of the actual work is done in aio_process,
731  * but the setup (and address space mgmt) is done in this routine.
732  */
733 static void
734 aio_daemon(void *uproc)
735 {
736 	int s;
737 	struct aio_liojob *lj;
738 	struct aiocb *cb;
739 	struct aiocblist *aiocbe;
740 	struct aiothreadlist *aiop;
741 	struct kaioinfo *ki;
742 	struct proc *curcp, *mycp, *userp;
743 	struct vmspace *myvm, *tmpvm;
744 	struct thread *td = curthread;
745 	struct pgrp *newpgrp;
746 	struct session *newsess;
747 
748 	mtx_lock(&Giant);
749 	/*
750 	 * Local copies of curproc (cp) and vmspace (myvm)
751 	 */
752 	mycp = td->td_proc;
753 	myvm = mycp->p_vmspace;
754 
755 	if (mycp->p_textvp) {
756 		vrele(mycp->p_textvp);
757 		mycp->p_textvp = NULL;
758 	}
759 
760 	/*
761 	 * Allocate and ready the aio control info.  There is one aiop structure
762 	 * per daemon.
763 	 */
764 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
765 	aiop->aiothread = td;
766 	aiop->aiothreadflags |= AIOP_FREE;
767 
768 	s = splnet();
769 
770 	/*
771 	 * Place thread (lightweight process) onto the AIO free thread list.
772 	 */
773 	if (TAILQ_EMPTY(&aio_freeproc))
774 		wakeup(&aio_freeproc);
775 	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
776 
777 	splx(s);
778 
779 	/*
780 	 * Get rid of our current file descriptors.  AIODs don't need any
781 	 * file descriptors, except those temporarily inherited from the client.
782 	 */
783 	fdfree(td);
784 
785 	mtx_unlock(&Giant);
786 	/* The daemon resides in its own pgrp. */
787 	MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
788 		M_WAITOK | M_ZERO);
789 	MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION,
790 		M_WAITOK | M_ZERO);
791 
792 	sx_xlock(&proctree_lock);
793 	enterpgrp(mycp, mycp->p_pid, newpgrp, newsess);
794 	sx_xunlock(&proctree_lock);
795 	mtx_lock(&Giant);
796 
797 	/* Mark special process type. */
798 	mycp->p_flag |= P_SYSTEM;
799 
800 	/*
801 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
802 	 * and creating too many daemons.)
803 	 */
804 	wakeup(mycp);
805 
806 	for (;;) {
807 		/*
808 		 * curcp is the current daemon process context.
809 		 * userp is the current user process context.
810 		 */
811 		curcp = mycp;
812 
813 		/*
814 		 * Take daemon off of free queue
815 		 */
816 		if (aiop->aiothreadflags & AIOP_FREE) {
817 			s = splnet();
818 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
819 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
820 			aiop->aiothreadflags &= ~AIOP_FREE;
821 			splx(s);
822 		}
823 		aiop->aiothreadflags &= ~AIOP_SCHED;
824 
825 		/*
826 		 * Check for jobs.
827 		 */
828 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
829 			cb = &aiocbe->uaiocb;
830 			userp = aiocbe->userproc;
831 
832 			aiocbe->jobstate = JOBST_JOBRUNNING;
833 
834 			/*
835 			 * Connect to process address space for user program.
836 			 */
837 			if (userp != curcp) {
838 				/*
839 				 * Save the current address space that we are
840 				 * connected to.
841 				 */
842 				tmpvm = mycp->p_vmspace;
843 
844 				/*
845 				 * Point to the new user address space, and
846 				 * refer to it.
847 				 */
848 				mycp->p_vmspace = userp->p_vmspace;
849 				mycp->p_vmspace->vm_refcnt++;
850 
851 				/* Activate the new mapping. */
852 				pmap_activate(FIRST_THREAD_IN_PROC(mycp));
853 
854 				/*
855 				 * If the old address space wasn't the daemon's
856 				 * own address space, then we need to remove the
857 				 * daemon's reference from the other process
858 				 * that it was acting on behalf of.
859 				 */
860 				if (tmpvm != myvm) {
861 					vmspace_free(tmpvm);
862 				}
863 				curcp = userp;
864 			}
865 
866 			ki = userp->p_aioinfo;
867 			lj = aiocbe->lio;
868 
869 			/* Account for currently active jobs. */
870 			ki->kaio_active_count++;
871 
872 			/* Do the I/O function. */
873 			aio_process(aiocbe);
874 
875 			/* Decrement the active job count. */
876 			ki->kaio_active_count--;
877 
878 			/*
879 			 * Increment the completion count for wakeup/signal
880 			 * comparisons.
881 			 */
882 			aiocbe->jobflags |= AIOCBLIST_DONE;
883 			ki->kaio_queue_finished_count++;
884 			if (lj)
885 				lj->lioj_queue_finished_count++;
886 			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
887 			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
888 				ki->kaio_flags &= ~KAIO_WAKEUP;
889 				wakeup(userp);
890 			}
891 
892 			s = splbio();
893 			if (lj && (lj->lioj_flags &
894 			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
895 				if ((lj->lioj_queue_finished_count ==
896 				    lj->lioj_queue_count) &&
897 				    (lj->lioj_buffer_finished_count ==
898 				    lj->lioj_buffer_count)) {
899 					PROC_LOCK(userp);
900 					psignal(userp,
901 					    lj->lioj_signal.sigev_signo);
902 					PROC_UNLOCK(userp);
903 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
904 				}
905 			}
906 			splx(s);
907 
908 			aiocbe->jobstate = JOBST_JOBFINISHED;
909 
910 			s = splnet();
911 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
912 			TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
913 			splx(s);
914 			KNOTE(&aiocbe->klist, 0);
915 
916 			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
917 				wakeup(aiocbe);
918 				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
919 			}
920 
921 			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
922 				PROC_LOCK(userp);
923 				psignal(userp, cb->aio_sigevent.sigev_signo);
924 				PROC_UNLOCK(userp);
925 			}
926 		}
927 
928 		/*
929 		 * Disconnect from user address space.
930 		 */
931 		if (curcp != mycp) {
932 			/* Get the user address space to disconnect from. */
933 			tmpvm = mycp->p_vmspace;
934 
935 			/* Get original address space for daemon. */
936 			mycp->p_vmspace = myvm;
937 
938 			/* Activate the daemon's address space. */
939 			pmap_activate(FIRST_THREAD_IN_PROC(mycp));
940 #ifdef DIAGNOSTIC
941 			if (tmpvm == myvm) {
942 				printf("AIOD: vmspace problem -- %d\n",
943 				    mycp->p_pid);
944 			}
945 #endif
946 			/* Remove our vmspace reference. */
947 			vmspace_free(tmpvm);
948 
949 			curcp = mycp;
950 		}
951 
952 		/*
953 		 * If we are the first to be put onto the free queue, wakeup
954 		 * anyone waiting for a daemon.
955 		 */
956 		s = splnet();
957 		TAILQ_REMOVE(&aio_activeproc, aiop, list);
958 		if (TAILQ_EMPTY(&aio_freeproc))
959 			wakeup(&aio_freeproc);
960 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
961 		aiop->aiothreadflags |= AIOP_FREE;
962 		splx(s);
963 
964 		/*
965 		 * If daemon is inactive for a long time, allow it to exit,
966 		 * thereby freeing resources.
967 		 */
968 		if ((aiop->aiothreadflags & AIOP_SCHED) == 0 &&
969 		    tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) {
970 			s = splnet();
971 			if (TAILQ_EMPTY(&aio_jobs)) {
972 				if ((aiop->aiothreadflags & AIOP_FREE) &&
973 				    (num_aio_procs > target_aio_procs)) {
974 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
975 					splx(s);
976 					uma_zfree(aiop_zone, aiop);
977 					num_aio_procs--;
978 #ifdef DIAGNOSTIC
979 					if (mycp->p_vmspace->vm_refcnt <= 1) {
980 						printf("AIOD: bad vm refcnt for"
981 						    " exiting daemon: %d\n",
982 						    mycp->p_vmspace->vm_refcnt);
983 					}
984 #endif
985 					kthread_exit(0);
986 				}
987 			}
988 			splx(s);
989 		}
990 	}
991 }
992 
993 /*
994  * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
995  * AIO daemon modifies its environment itself.
996  */
997 static int
998 aio_newproc(void)
999 {
1000 	int error;
1001 	struct proc *p;
1002 
1003 	error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, 0, "aiod%d",
1004 			       num_aio_procs);
1005 	if (error)
1006 		return error;
1007 
1008 	/*
1009 	 * Wait until daemon is started, but continue on just in case to
1010 	 * handle error conditions.
1011 	 */
1012 	error = tsleep(p, PZERO, "aiosta", aiod_timeout);
1013 
1014 	num_aio_procs++;
1015 
1016 	return error;
1017 }
1018 
1019 /*
1020  * Try the high-performance, low-overhead physio method for eligible
1021  * VCHR devices.  This method doesn't use an aio helper thread, and
1022  * thus has very low overhead.
1023  *
1024  * Assumes that the caller, _aio_aqueue(), has incremented the file
1025  * structure's reference count, preventing its deallocation for the
1026  * duration of this call.
1027  */
1028 static int
1029 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1030 {
1031 	int error;
1032 	struct aiocb *cb;
1033 	struct file *fp;
1034 	struct buf *bp;
1035 	struct vnode *vp;
1036 	struct kaioinfo *ki;
1037 	struct aio_liojob *lj;
1038 	int s;
1039 	int notify;
1040 
1041 	cb = &aiocbe->uaiocb;
1042 	fp = aiocbe->fd_file;
1043 
1044 	if (fp->f_type != DTYPE_VNODE)
1045 		return (-1);
1046 
1047 	vp = (struct vnode *)fp->f_data;
1048 
1049 	/*
1050 	 * If it's not a disk, we don't want to return a positive error.  That
1051 	 * would keep the aio code from falling through to the threaded path
1052 	 * when the target is a regular file.
1053 	 */
1054 	if (!vn_isdisk(vp, &error)) {
1055 		if (error == ENOTBLK)
1056 			return (-1);
1057 		else
1058 			return (error);
1059 	}
1060 
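	/*
	 * Physio requires the transfer to be a multiple of the device's
	 * physical block size and small enough to map in one buf (MAXPHYS,
	 * less the initial page offset of the user buffer).
	 */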
1061  	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
1062 		return (-1);
1063 
1064 	if (cb->aio_nbytes >
1065 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1066 		return (-1);
1067 
1068 	ki = p->p_aioinfo;
1069 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1070 		return (-1);
1071 
1072 	ki->kaio_buffer_count++;
1073 
1074 	lj = aiocbe->lio;
1075 	if (lj)
1076 		lj->lioj_buffer_count++;
1077 
1078 	/* Create and build a buffer header for a transfer. */
1079 	bp = (struct buf *)getpbuf(NULL);
1080 	BUF_KERNPROC(bp);
1081 
1082 	/*
1083 	 * Get a copy of the kva from the physical buffer.
1084 	 */
1085 	bp->b_caller1 = p;
1086 	bp->b_dev = vp->v_rdev;
1087 	error = bp->b_error = 0;
1088 
1089 	bp->b_bcount = cb->aio_nbytes;
1090 	bp->b_bufsize = cb->aio_nbytes;
1091 	bp->b_flags = B_PHYS;
1092 	bp->b_iodone = aio_physwakeup;
1093 	bp->b_saveaddr = bp->b_data;
1094 	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1095 	bp->b_blkno = btodb(cb->aio_offset);
1096 
1097 	if (cb->aio_lio_opcode == LIO_WRITE) {
1098 		bp->b_iocmd = BIO_WRITE;
1099 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1100 			error = EFAULT;
1101 			goto doerror;
1102 		}
1103 	} else {
1104 		bp->b_iocmd = BIO_READ;
1105 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1106 			error = EFAULT;
1107 			goto doerror;
1108 		}
1109 	}
1110 
1111 	/* Bring buffer into kernel space. */
1112 	vmapbuf(bp);
1113 
1114 	s = splbio();
1115 	aiocbe->bp = bp;
1116 	bp->b_spc = (void *)aiocbe;
1117 	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1118 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1119 	aiocbe->jobstate = JOBST_JOBQBUF;
1120 	cb->_aiocb_private.status = cb->aio_nbytes;
1121 	num_buf_aio++;
1122 	bp->b_error = 0;
1123 
1124 	splx(s);
1125 
1126 	/* Perform transfer. */
1127 	DEV_STRATEGY(bp, 0);
1128 
1129 	notify = 0;
1130 	s = splbio();
1131 
1132 	/*
1133 	 * If we had an error invoking the request, or an error in processing
1134 	 * the request before we have returned, we process it as an error in
1135 	 * transfer.  Note that such an I/O error is not indicated immediately,
1136 	 * but is returned using the aio_error mechanism.  In this case,
1137 	 * aio_suspend will return immediately.
1138 	 */
1139 	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1140 		struct aiocb *job = aiocbe->uuaiocb;
1141 
1142 		aiocbe->uaiocb._aiocb_private.status = 0;
1143 		suword(&job->_aiocb_private.status, 0);
1144 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1145 		suword(&job->_aiocb_private.error, bp->b_error);
1146 
1147 		ki->kaio_buffer_finished_count++;
1148 
1149 		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1150 			aiocbe->jobstate = JOBST_JOBBFINISHED;
1151 			aiocbe->jobflags |= AIOCBLIST_DONE;
1152 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1153 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1154 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1155 			notify = 1;
1156 		}
1157 	}
1158 	splx(s);
1159 	if (notify)
1160 		KNOTE(&aiocbe->klist, 0);
1161 	return 0;
1162 
1163 doerror:
1164 	ki->kaio_buffer_count--;
1165 	if (lj)
1166 		lj->lioj_buffer_count--;
1167 	aiocbe->bp = NULL;
1168 	relpbuf(bp, NULL);
1169 	return error;
1170 }
1171 
1172 /*
1173  * This waits/tests physio completion.
1174  */
1175 static int
1176 aio_fphysio(struct aiocblist *iocb)
1177 {
1178 	int s;
1179 	struct buf *bp;
1180 	int error;
1181 
1182 	bp = iocb->bp;
1183 
1184 	s = splbio();
1185 	while ((bp->b_flags & B_DONE) == 0) {
1186 		if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
1187 			if ((bp->b_flags & B_DONE) == 0) {
1188 				splx(s);
1189 				return EINPROGRESS;
1190 			} else
1191 				break;
1192 		}
1193 	}
1194 	splx(s);
1195 
1196 	/* Release mapping into kernel space. */
1197 	vunmapbuf(bp);
1198 	iocb->bp = NULL;
1199 
1200 	error = 0;
1201 
1202 	/* Check for an error. */
1203 	if (bp->b_ioflags & BIO_ERROR)
1204 		error = bp->b_error;
1205 
1206 	relpbuf(bp, NULL);
1207 	return (error);
1208 }
1209 
1210 /*
1211  * Wake up aio requests that may be serviceable now.
1212  */
1213 static void
1214 aio_swake_cb(struct socket *so, struct sockbuf *sb)
1215 {
1216 	struct aiocblist *cb,*cbn;
1217 	struct proc *p;
1218 	struct kaioinfo *ki = NULL;
1219 	int opcode, wakecount = 0;
1220 	struct aiothreadlist *aiop;
1221 
1222 	if (sb == &so->so_snd) {
1223 		opcode = LIO_WRITE;
1224 		so->so_snd.sb_flags &= ~SB_AIO;
1225 	} else {
1226 		opcode = LIO_READ;
1227 		so->so_rcv.sb_flags &= ~SB_AIO;
1228 	}
1229 
1230 	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1231 		cbn = TAILQ_NEXT(cb, list);
1232 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1233 			p = cb->userproc;
1234 			ki = p->p_aioinfo;
1235 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1236 			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1237 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1238 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1239 			wakecount++;
1240 			if (cb->jobstate != JOBST_JOBQGLOBAL)
1241 				panic("invalid queue value");
1242 		}
1243 	}
1244 
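	/* Wake up one idle daemon for each job moved back to the global queue. */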
1245 	while (wakecount--) {
1246 		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1247 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1248 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1249 			aiop->aiothreadflags &= ~AIOP_FREE;
1250 			wakeup(aiop->aiothread);
1251 		}
1252 	}
1253 }
1254 
1255 /*
1256  * Queue a new AIO request.  The choice between the threaded and the direct
1257  * physio (VCHR) techniques is made here.
1258  */
1259 static int
1260 _aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
1261 {
1262 	struct proc *p = td->td_proc;
1263 	struct filedesc *fdp;
1264 	struct file *fp;
1265 	unsigned int fd;
1266 	struct socket *so;
1267 	int s;
1268 	int error;
1269 	int opcode, user_opcode;
1270 	struct aiocblist *aiocbe;
1271 	struct aiothreadlist *aiop;
1272 	struct kaioinfo *ki;
1273 	struct kevent kev;
1274 	struct kqueue *kq;
1275 	struct file *kq_fp;
1276 
1277 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK);
1278 	aiocbe->inputcharge = 0;
1279 	aiocbe->outputcharge = 0;
1280 	callout_handle_init(&aiocbe->timeouthandle);
1281 	SLIST_INIT(&aiocbe->klist);
1282 
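	/*
	 * Initialize the user-visible control block: no status yet, no error,
	 * and no kernel job id assigned.
	 */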
1283 	suword(&job->_aiocb_private.status, -1);
1284 	suword(&job->_aiocb_private.error, 0);
1285 	suword(&job->_aiocb_private.kernelinfo, -1);
1286 
1287 	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1288 	if (error) {
1289 		suword(&job->_aiocb_private.error, error);
1290 		uma_zfree(aiocb_zone, aiocbe);
1291 		return error;
1292 	}
1293 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1294 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1295 		uma_zfree(aiocb_zone, aiocbe);
1296 		return EINVAL;
1297 	}
1298 
1299 	/* Save userspace address of the job info. */
1300 	aiocbe->uuaiocb = job;
1301 
1302 	/* Get the opcode. */
1303 	user_opcode = aiocbe->uaiocb.aio_lio_opcode;
1304 	if (type != LIO_NOP)
1305 		aiocbe->uaiocb.aio_lio_opcode = type;
1306 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1307 
1308 	/* Get the fd info for process. */
1309 	fdp = p->p_fd;
1310 
1311 	/*
1312 	 * Range check file descriptor.
1313 	 */
1314 	fd = aiocbe->uaiocb.aio_fildes;
1315 	if (fd >= fdp->fd_nfiles) {
1316 		uma_zfree(aiocb_zone, aiocbe);
1317 		if (type == 0)
1318 			suword(&job->_aiocb_private.error, EBADF);
1319 		return EBADF;
1320 	}
1321 
1322 	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1323 	if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1324 	    0))) {
1325 		uma_zfree(aiocb_zone, aiocbe);
1326 		if (type == 0)
1327 			suword(&job->_aiocb_private.error, EBADF);
1328 		return EBADF;
1329 	}
1330 	fhold(fp);
1331 
1332 	if (aiocbe->uaiocb.aio_offset == -1LL) {
1333 		error = EINVAL;
1334 		goto aqueue_fail;
1335 	}
1336 	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1337 	if (error) {
1338 		error = EINVAL;
1339 		goto aqueue_fail;
1340 	}
1341 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
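	/*
	 * Advance the global job reference id, wrapping back to 1; 0 and -1
	 * are reserved to mean "no job".
	 */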
1342 	if (jobrefid == LONG_MAX)
1343 		jobrefid = 1;
1344 	else
1345 		jobrefid++;
1346 
1347 	if (opcode == LIO_NOP) {
1348 		fdrop(fp, td);
1349 		uma_zfree(aiocb_zone, aiocbe);
1350 		if (type == 0) {
1351 			suword(&job->_aiocb_private.error, 0);
1352 			suword(&job->_aiocb_private.status, 0);
1353 			suword(&job->_aiocb_private.kernelinfo, 0);
1354 		}
1355 		return 0;
1356 	}
1357 	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1358 		if (type == 0)
1359 			suword(&job->_aiocb_private.status, 0);
1360 		error = EINVAL;
1361 		goto aqueue_fail;
1362 	}
1363 
1364 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1365 		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1366 		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1367 	}
1368 	else {
1369 		/*
1370 		 * This method for requesting kevent-based notification won't
1371 		 * work on the alpha, since we're passing in a pointer
1372 		 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
1373 		 * based method instead.
1374 		 */
1375 		if (user_opcode == LIO_NOP || user_opcode == LIO_READ ||
1376 		    user_opcode == LIO_WRITE)
1377 			goto no_kqueue;
1378 
1379 		error = copyin((struct kevent *)(uintptr_t)user_opcode,
1380 		    &kev, sizeof(kev));
1381 		if (error)
1382 			goto aqueue_fail;
1383 	}
1384 	if ((u_int)kev.ident >= fdp->fd_nfiles ||
1385 	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1386 	    (kq_fp->f_type != DTYPE_KQUEUE)) {
1387 		error = EBADF;
1388 		goto aqueue_fail;
1389 	}
1390 	kq = (struct kqueue *)kq_fp->f_data;
1391 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
1392 	kev.filter = EVFILT_AIO;
1393 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1394 	kev.data = (intptr_t)aiocbe;
1395 	error = kqueue_register(kq, &kev, td);
1396 aqueue_fail:
1397 	if (error) {
1398 		fdrop(fp, td);
1399 		uma_zfree(aiocb_zone, aiocbe);
1400 		if (type == 0)
1401 			suword(&job->_aiocb_private.error, error);
1402 		goto done;
1403 	}
1404 no_kqueue:
1405 
1406 	suword(&job->_aiocb_private.error, EINPROGRESS);
1407 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1408 	aiocbe->userproc = p;
1409 	aiocbe->jobflags = 0;
1410 	aiocbe->lio = lj;
1411 	ki = p->p_aioinfo;
1412 
1413 	if (fp->f_type == DTYPE_SOCKET) {
1414 		/*
1415 		 * Alternate queueing for socket ops: Reach down into the
1416 		 * descriptor to get the socket data.  Then check to see if the
1417 		 * socket is ready to be read or written (based on the requested
1418 		 * operation).
1419 		 *
1420 		 * If it is not ready for I/O, then queue the aiocbe on the
1421 		 * socket, and set the flags so we get a call when sbnotify()
1422 		 * happens.
1423 		 */
1424 		so = (struct socket *)fp->f_data;
1425 		s = splnet();
1426 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1427 		    LIO_WRITE) && (!sowriteable(so)))) {
1428 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1429 			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1430 			if (opcode == LIO_READ)
1431 				so->so_rcv.sb_flags |= SB_AIO;
1432 			else
1433 				so->so_snd.sb_flags |= SB_AIO;
1434 			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1435 			ki->kaio_queue_count++;
1436 			num_queue_count++;
1437 			splx(s);
1438 			error = 0;
1439 			goto done;
1440 		}
1441 		splx(s);
1442 	}
1443 
1444 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1445 		goto done;
1446 	if (error > 0) {
1447 		suword(&job->_aiocb_private.status, 0);
1448 		aiocbe->uaiocb._aiocb_private.error = error;
1449 		suword(&job->_aiocb_private.error, error);
1450 		goto done;
1451 	}
1452 
1453 	/* No buffer for daemon I/O. */
1454 	aiocbe->bp = NULL;
1455 
1456 	ki->kaio_queue_count++;
1457 	if (lj)
1458 		lj->lioj_queue_count++;
1459 	s = splnet();
1460 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1461 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1462 	splx(s);
1463 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1464 
1465 	num_queue_count++;
1466 	error = 0;
1467 
1468 	/*
1469 	 * If we don't have a free AIO process, and we are below our quota, then
1470 	 * start one.  Otherwise, depend on the subsequent I/O completions to
1471 	 * pick up this job.  If we don't successfully create the new process
1472 	 * (thread) due to resource issues, we return an error for now (EAGAIN),
1473 	 * which is likely not the correct thing to do.
1474 	 */
1475 	s = splnet();
1476 retryproc:
1477 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1478 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1479 		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1480 		aiop->aiothreadflags &= ~AIOP_FREE;
1481 		wakeup(aiop->aiothread);
1482 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1483 	    ((ki->kaio_active_count + num_aio_resv_start) <
1484 	    ki->kaio_maxactive_count)) {
1485 		num_aio_resv_start++;
1486 		if ((error = aio_newproc()) == 0) {
1487 			num_aio_resv_start--;
1488 			goto retryproc;
1489 		}
1490 		num_aio_resv_start--;
1491 	}
1492 	splx(s);
1493 done:
1494 	return error;
1495 }
1496 
1497 /*
1498  * This routine queues an AIO request, checking for quotas.
1499  */
1500 static int
1501 aio_aqueue(struct thread *td, struct aiocb *job, int type)
1502 {
1503 	struct proc *p = td->td_proc;
1504 	struct kaioinfo *ki;
1505 
1506 	if (p->p_aioinfo == NULL)
1507 		aio_init_aioinfo(p);
1508 
1509 	if (num_queue_count >= max_queue_count)
1510 		return EAGAIN;
1511 
1512 	ki = p->p_aioinfo;
1513 	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1514 		return EAGAIN;
1515 
1516 	return _aio_aqueue(td, job, NULL, type);
1517 }
1518 
1519 /*
1520  * Support the aio_return system call; as a side effect, kernel resources are
1521  * released.
1522  */
1523 int
1524 aio_return(struct thread *td, struct aio_return_args *uap)
1525 {
1526 	struct proc *p = td->td_proc;
1527 	int s;
1528 	long jobref;
1529 	struct aiocblist *cb, *ncb;
1530 	struct aiocb *ujob;
1531 	struct kaioinfo *ki;
1532 
1533 	ujob = uap->aiocbp;
1534 	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1535 	if (jobref == -1 || jobref == 0)
1536 		return EINVAL;
1537 
1538 	ki = p->p_aioinfo;
1539 	if (ki == NULL)
1540 		return EINVAL;
1541 	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1542 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1543 		    jobref) {
1544 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1545 				p->p_stats->p_ru.ru_oublock +=
1546 				    cb->outputcharge;
1547 				cb->outputcharge = 0;
1548 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1549 				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1550 				cb->inputcharge = 0;
1551 			}
1552 			goto done;
1553 		}
1554 	}
1555 	s = splbio();
1556 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1557 		ncb = TAILQ_NEXT(cb, plist);
1558 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1559 		    == jobref) {
1560 			break;
1561 		}
1562 	}
1563 	splx(s);
1564  done:
1565 	if (cb != NULL) {
1566 		if (ujob == cb->uuaiocb) {
1567 			td->td_retval[0] =
1568 			    cb->uaiocb._aiocb_private.status;
1569 		} else
1570 			td->td_retval[0] = EFAULT;
1571 		aio_free_entry(cb);
1572 		return (0);
1573 	}
1574 	return (EINVAL);
1575 }
1576 
1577 /*
1578  * Allow a process to wakeup when any of the I/O requests are completed.
1579  */
1580 int
1581 aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1582 {
1583 	struct proc *p = td->td_proc;
1584 	struct timeval atv;
1585 	struct timespec ts;
1586 	struct aiocb *const *cbptr, *cbp;
1587 	struct kaioinfo *ki;
1588 	struct aiocblist *cb;
1589 	int i;
1590 	int njoblist;
1591 	int error, s, timo;
1592 	long *ijoblist;
1593 	struct aiocb **ujoblist;
1594 
1595 	if (uap->nent > AIO_LISTIO_MAX)
1596 		return EINVAL;
1597 
1598 	timo = 0;
1599 	if (uap->timeout) {
1600 		/* Get timespec struct. */
1601 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1602 			return error;
1603 
1604 		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1605 			return (EINVAL);
1606 
1607 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
1608 		if (itimerfix(&atv))
1609 			return (EINVAL);
1610 		timo = tvtohz(&atv);
1611 	}
1612 
1613 	ki = p->p_aioinfo;
1614 	if (ki == NULL)
1615 		return EAGAIN;
1616 
1617 	njoblist = 0;
1618 	ijoblist = uma_zalloc(aiol_zone, M_WAITOK);
1619 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
1620 	cbptr = uap->aiocbp;
1621 
1622 	for (i = 0; i < uap->nent; i++) {
1623 		cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
1624 		if (cbp == 0)
1625 			continue;
1626 		ujoblist[njoblist] = cbp;
1627 		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1628 		njoblist++;
1629 	}
1630 
1631 	if (njoblist == 0) {
1632 		uma_zfree(aiol_zone, ijoblist);
1633 		uma_zfree(aiol_zone, ujoblist);
1634 		return 0;
1635 	}
1636 
1637 	error = 0;
1638 	for (;;) {
1639 		TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1640 			for (i = 0; i < njoblist; i++) {
1641 				if (((intptr_t)
1642 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1643 				    ijoblist[i]) {
1644 					if (ujoblist[i] != cb->uuaiocb)
1645 						error = EINVAL;
1646 					uma_zfree(aiol_zone, ijoblist);
1647 					uma_zfree(aiol_zone, ujoblist);
1648 					return error;
1649 				}
1650 			}
1651 		}
1652 
1653 		s = splbio();
1654 		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1655 		    TAILQ_NEXT(cb, plist)) {
1656 			for (i = 0; i < njoblist; i++) {
1657 				if (((intptr_t)
1658 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1659 				    ijoblist[i]) {
1660 					splx(s);
1661 					if (ujoblist[i] != cb->uuaiocb)
1662 						error = EINVAL;
1663 					uma_zfree(aiol_zone, ijoblist);
1664 					uma_zfree(aiol_zone, ujoblist);
1665 					return error;
1666 				}
1667 			}
1668 		}
1669 
1670 		ki->kaio_flags |= KAIO_WAKEUP;
1671 		error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1672 		splx(s);
1673 
1674 		if (error == ERESTART || error == EINTR) {
1675 			uma_zfree(aiol_zone, ijoblist);
1676 			uma_zfree(aiol_zone, ujoblist);
1677 			return EINTR;
1678 		} else if (error == EWOULDBLOCK) {
1679 			uma_zfree(aiol_zone, ijoblist);
1680 			uma_zfree(aiol_zone, ujoblist);
1681 			return EAGAIN;
1682 		}
1683 	}
1684 
1685 /* NOTREACHED */
1686 	return EINVAL;
1687 }
1688 
1689 /*
1690  * aio_cancel cancels any non-physio aio operations not currently in
1691  * progress.
1692  */
1693 int
1694 aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1695 {
1696 	struct proc *p = td->td_proc;
1697 	struct kaioinfo *ki;
1698 	struct aiocblist *cbe, *cbn;
1699 	struct file *fp;
1700 	struct filedesc *fdp;
1701 	struct socket *so;
1702 	struct proc *po;
1703 	int s, error;
1704 	int cancelled = 0;
1705 	int notcancelled = 0;
1706 	struct vnode *vp;
1707 
1708 	fdp = p->p_fd;
1709 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
1710 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1711 		return (EBADF);
1712 
1713 	if (fp->f_type == DTYPE_VNODE) {
1714 		vp = (struct vnode *)fp->f_data;
1715 
1716 		if (vn_isdisk(vp, &error)) {
1717 			td->td_retval[0] = AIO_NOTCANCELED;
1718 			return 0;
1719 		}
1720 	} else if (fp->f_type == DTYPE_SOCKET) {
1721 		so = (struct socket *)fp->f_data;
1722 
1723 		s = splnet();
1724 
1725 		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1726 			cbn = TAILQ_NEXT(cbe, list);
1727 			if ((uap->aiocbp == NULL) ||
1728 				(uap->aiocbp == cbe->uuaiocb) ) {
1729 				po = cbe->userproc;
1730 				ki = po->p_aioinfo;
1731 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1732 				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1733 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1734 				if (ki->kaio_flags & KAIO_WAKEUP) {
1735 					wakeup(po);
1736 				}
1737 				cbe->jobstate = JOBST_JOBFINISHED;
1738 				cbe->uaiocb._aiocb_private.status = -1;
1739 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1740 				cancelled++;
1741 /* XXX cancelled, knote? */
1742 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1743 				    SIGEV_SIGNAL) {
1744 					PROC_LOCK(cbe->userproc);
1745 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1746 					PROC_UNLOCK(cbe->userproc);
1747 				}
1748 				if (uap->aiocbp)
1749 					break;
1750 			}
1751 		}
1752 		splx(s);
1753 
1754 		if ((cancelled) && (uap->aiocbp)) {
1755 			td->td_retval[0] = AIO_CANCELED;
1756 			return 0;
1757 		}
1758 	}
1759 	ki = p->p_aioinfo;
1760 	if (ki == NULL)
1761 		goto done;
1762 	s = splnet();
1763 
1764 	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1765 		cbn = TAILQ_NEXT(cbe, plist);
1766 
1767 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1768 		    ((uap->aiocbp == NULL ) ||
1769 		     (uap->aiocbp == cbe->uuaiocb))) {
1770 
1771 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1772 				TAILQ_REMOVE(&aio_jobs, cbe, list);
1773 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1774 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1775 				    plist);
1776 				cancelled++;
1777 				ki->kaio_queue_finished_count++;
1778 				cbe->jobstate = JOBST_JOBFINISHED;
1779 				cbe->uaiocb._aiocb_private.status = -1;
1780 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1781 /* XXX cancelled, knote? */
1782 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1783 				    SIGEV_SIGNAL) {
1784 					PROC_LOCK(cbe->userproc);
1785 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1786 					PROC_UNLOCK(cbe->userproc);
1787 				}
1788 			} else {
1789 				notcancelled++;
1790 			}
1791 		}
1792 	}
1793 	splx(s);
1794 done:
1795 	if (notcancelled) {
1796 		td->td_retval[0] = AIO_NOTCANCELED;
1797 		return 0;
1798 	}
1799 	if (cancelled) {
1800 		td->td_retval[0] = AIO_CANCELED;
1801 		return 0;
1802 	}
1803 	td->td_retval[0] = AIO_ALLDONE;
1804 
1805 	return 0;
1806 }
1807 
1808 /*
1809  * aio_error is implemented at the kernel level for compatibility purposes only.
1810  * For a user mode async implementation, it would be best to do it in a userland
1811  * subroutine.
1812  */
1813 int
1814 aio_error(struct thread *td, struct aio_error_args *uap)
1815 {
1816 	struct proc *p = td->td_proc;
1817 	int s;
1818 	struct aiocblist *cb;
1819 	struct kaioinfo *ki;
1820 	long jobref;
1821 
1822 	ki = p->p_aioinfo;
1823 	if (ki == NULL)
1824 		return EINVAL;
1825 
1826 	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1827 	if ((jobref == -1) || (jobref == 0))
1828 		return EINVAL;
1829 
1830 	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1831 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1832 		    jobref) {
1833 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1834 			return 0;
1835 		}
1836 	}
1837 
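	/*
	 * Not among the completed daemon jobs; check the pending daemon and
	 * socket queues, then the physio done and pending queues.
	 */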
1838 	s = splnet();
1839 
1840 	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1841 	    plist)) {
1842 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1843 		    jobref) {
1844 			td->td_retval[0] = EINPROGRESS;
1845 			splx(s);
1846 			return 0;
1847 		}
1848 	}
1849 
1850 	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1851 	    plist)) {
1852 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1853 		    jobref) {
1854 			td->td_retval[0] = EINPROGRESS;
1855 			splx(s);
1856 			return 0;
1857 		}
1858 	}
1859 	splx(s);
1860 
1861 	s = splbio();
1862 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1863 	    plist)) {
1864 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1865 		    jobref) {
1866 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1867 			splx(s);
1868 			return 0;
1869 		}
1870 	}
1871 
1872 	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1873 	    plist)) {
1874 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1875 		    jobref) {
1876 			td->td_retval[0] = EINPROGRESS;
1877 			splx(s);
1878 			return 0;
1879 		}
1880 	}
1881 	splx(s);
1882 
1883 #if (0)
1884 	/*
1885 	 * Hack for lio.
1886 	 */
1887 	status = fuword(&uap->aiocbp->_aiocb_private.status);
1888 	if (status == -1)
1889 		return fuword(&uap->aiocbp->_aiocb_private.error);
1890 #endif
1891 	return EINVAL;
1892 }
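
#if 0
/*
 * Illustrative sketch only (not kernel code, never compiled): a
 * userland aio_error() could avoid the syscall by reading the error
 * status that the AIO code writes back into the caller's aiocb.  The
 * reliance on the _aiocb_private layout is an assumption made for
 * this sketch; a real libc implementation would own that ABI.
 */
static int
aio_error_user(const struct aiocb *iocb)
{
	long error = iocb->_aiocb_private.error;

	/* Presumably EINPROGRESS until the request completes. */
	return ((int)error);
}
#endif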
1893 
1894 /* syscall - asynchronous read from a file (REALTIME) */
1895 int
1896 aio_read(struct thread *td, struct aio_read_args *uap)
1897 {
1898 
1899 	return aio_aqueue(td, uap->aiocbp, LIO_READ);
1900 }
1901 
1902 /* syscall - asynchronous write to a file (REALTIME) */
1903 int
1904 aio_write(struct thread *td, struct aio_write_args *uap)
1905 {
1906 
1907 	return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
1908 }
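
#if 0
/*
 * Illustrative userland usage sketch only (not kernel code, never
 * compiled): queue a single asynchronous read via the aio_read()
 * syscall above and poll aio_error() until it completes.  The helper
 * name is hypothetical; a real caller would include <aio.h>,
 * <string.h> and <errno.h>.
 */
static ssize_t
read_async(int fd, void *buf, size_t len, off_t off)
{
	struct aiocb cb;
	int error;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_offset = off;

	if (aio_read(&cb) == -1)
		return (-1);

	/* Poll for completion; a real program would do other work here. */
	while ((error = aio_error(&cb)) == EINPROGRESS)
		;

	if (error != 0)
		return (-1);
	return (aio_return(&cb));
}
#endif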
1909 
1910 /* syscall - list directed async I/O (REALTIME) - XXX undocumented */
1911 int
1912 lio_listio(struct thread *td, struct lio_listio_args *uap)
1913 {
1914 	struct proc *p = td->td_proc;
1915 	int nent, nentqueued;
1916 	struct aiocb *iocb, * const *cbptr;
1917 	struct aiocblist *cb;
1918 	struct kaioinfo *ki;
1919 	struct aio_liojob *lj;
1920 	int error, runningcode;
1921 	int nerror;
1922 	int i;
1923 	int s;
1924 
1925 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1926 		return EINVAL;
1927 
1928 	nent = uap->nent;
1929 	if (nent < 0 || nent > AIO_LISTIO_MAX)
1930 		return EINVAL;
1931 
1932 	if (p->p_aioinfo == NULL)
1933 		aio_init_aioinfo(p);
1934 
1935 	if ((nent + num_queue_count) > max_queue_count)
1936 		return EAGAIN;
1937 
1938 	ki = p->p_aioinfo;
1939 	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1940 		return EAGAIN;
1941 
1942 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
1943 	if (!lj)
1944 		return EAGAIN;
1945 
1946 	lj->lioj_flags = 0;
1947 	lj->lioj_buffer_count = 0;
1948 	lj->lioj_buffer_finished_count = 0;
1949 	lj->lioj_queue_count = 0;
1950 	lj->lioj_queue_finished_count = 0;
1951 	lj->lioj_ki = ki;
1952 
1953 	/*
1954 	 * Setup signal.
1955 	 */
1956 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1957 		error = copyin(uap->sig, &lj->lioj_signal,
1958 			       sizeof(lj->lioj_signal));
1959 		if (error) {
1960 			uma_zfree(aiolio_zone, lj);
1961 			return error;
1962 		}
1963 		if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
1964 			uma_zfree(aiolio_zone, lj);
1965 			return EINVAL;
1966 		}
1967 		lj->lioj_flags |= LIOJ_SIGNAL;
1968 	}
1969 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
1970 	/*
1971 	 * Get pointers to the list of I/O requests.
1972 	 */
1973 	nerror = 0;
1974 	nentqueued = 0;
1975 	cbptr = uap->acb_list;
1976 	for (i = 0; i < uap->nent; i++) {
1977 		iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
1978 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
1979 			error = _aio_aqueue(td, iocb, lj, 0);
1980 			if (error == 0)
1981 				nentqueued++;
1982 			else
1983 				nerror++;
1984 		}
1985 	}
1986 
1987 	/*
1988 	 * If we haven't queued any, then just return without error.
1989 	 */
1990 	if (nentqueued == 0)
1991 		return 0;
1992 
1993 	/*
1994 	 * Calculate the appropriate error return.
1995 	 */
1996 	runningcode = 0;
1997 	if (nerror)
1998 		runningcode = EIO;
1999 
2000 	if (uap->mode == LIO_WAIT) {
2001 		int command, found;
		long jobref;
2002 
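		/*
		 * Wait for every request queued above to complete: rescan
		 * the user's aiocb list against the per-process done queues
		 * and sleep on the process until an aio daemon or physio
		 * completion wakes us up.
		 */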
2003 		for (;;) {
2004 			found = 0;
2005 			for (i = 0; i < uap->nent; i++) {
2006 				/*
2007 				 * Fetch address of the control buf pointer in
2008 				 * user space.
2009 				 */
2010 				iocb = (struct aiocb *)
2011 				    (intptr_t)fuword(&cbptr[i]);
2012 				if (((intptr_t)iocb == -1) ||
2013 				    ((intptr_t)iocb == 0))
2014 					continue;
2015 
2016 				/*
2017 				 * Fetch the associated command from user space.
2018 				 */
2019 				command = fuword(&iocb->aio_lio_opcode);
2020 				if (command == LIO_NOP) {
2021 					found++;
2022 					continue;
2023 				}
2024 
2025 				jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2026 
2027 				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2028 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2029 					    == jobref) {
2030 						if (cb->uaiocb.aio_lio_opcode
2031 						    == LIO_WRITE) {
2032 							p->p_stats->p_ru.ru_oublock
2033 							    += cb->outputcharge;
2035 							cb->outputcharge = 0;
2036 						} else if (cb->uaiocb.aio_lio_opcode
2037 						    == LIO_READ) {
2038 							p->p_stats->p_ru.ru_inblock
2039 							    += cb->inputcharge;
2040 							cb->inputcharge = 0;
2041 						}
2042 						found++;
2043 						break;
2044 					}
2045 				}
2046 
2047 				s = splbio();
2048 				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2049 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2050 					    == jobref) {
2051 						found++;
2052 						break;
2053 					}
2054 				}
2055 				splx(s);
2056 			}
2057 
2058 			/*
2059 			 * If all I/Os have been disposed of, then we can
2060 			 * return.
2061 			 */
2062 			if (found == nentqueued)
2063 				return runningcode;
2064 
2065 			ki->kaio_flags |= KAIO_WAKEUP;
2066 			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2067 
2068 			if (error == EINTR)
2069 				return EINTR;
2070 			else if (error == EWOULDBLOCK)
2071 				return EAGAIN;
2072 		}
2073 	}
2074 
2075 	return runningcode;
2076 }
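
#if 0
/*
 * Illustrative userland usage sketch only (not kernel code, never
 * compiled): submit two reads as one lio_listio() batch and block
 * until both finish.  The helper name and layout are hypothetical;
 * a real caller would include <aio.h> and <string.h>.
 */
static int
read_two_blocks(int fd, void *buf0, void *buf1, size_t len)
{
	struct aiocb cb0, cb1;
	struct aiocb *list[2];

	memset(&cb0, 0, sizeof(cb0));
	cb0.aio_fildes = fd;
	cb0.aio_buf = buf0;
	cb0.aio_nbytes = len;
	cb0.aio_offset = 0;
	cb0.aio_lio_opcode = LIO_READ;

	cb1 = cb0;
	cb1.aio_buf = buf1;
	cb1.aio_offset = (off_t)len;

	list[0] = &cb0;
	list[1] = &cb1;

	/* LIO_WAIT: lio_listio() returns only when both requests are done. */
	return (lio_listio(LIO_WAIT, list, 2, NULL));
}
#endif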
2077 
2078 /*
2079  * This is a weird hack so that we can post a signal.  It is safe to do so from
2080  * a timeout routine, but *not* from an interrupt routine.
2081  */
2082 static void
2083 process_signal(void *aioj)
2084 {
2085 	struct aiocblist *aiocbe = aioj;
2086 	struct aio_liojob *lj = aiocbe->lio;
2087 	struct aiocb *cb = &aiocbe->uaiocb;
2088 
2089 	if ((lj != NULL) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2090 	    (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2091 		PROC_LOCK(lj->lioj_ki->kaio_p);
2092 		psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2093 		PROC_UNLOCK(lj->lioj_ki->kaio_p);
2094 		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2095 	}
2096 
2097 	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2098 		PROC_LOCK(aiocbe->userproc);
2099 		psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2100 		PROC_UNLOCK(aiocbe->userproc);
2101 	}
2102 }
2103 
2104 /*
2105  * Interrupt handler for physio; performs the necessary process wakeups and
2106  * schedules signal delivery via process_signal().
2107  */
2108 static void
2109 aio_physwakeup(struct buf *bp)
2110 {
2111 	struct aiocblist *aiocbe;
2112 	struct proc *p;
2113 	struct kaioinfo *ki;
2114 	struct aio_liojob *lj;
2115 
2116 	wakeup(bp);
2117 
2118 	aiocbe = (struct aiocblist *)bp->b_spc;
2119 	if (aiocbe) {
2120 		p = bp->b_caller1;
2121 
2122 		aiocbe->jobstate = JOBST_JOBBFINISHED;
2123 		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2124 		aiocbe->uaiocb._aiocb_private.error = 0;
2125 		aiocbe->jobflags |= AIOCBLIST_DONE;
2126 
2127 		if (bp->b_ioflags & BIO_ERROR)
2128 			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2129 
2130 		lj = aiocbe->lio;
2131 		if (lj) {
2132 			lj->lioj_buffer_finished_count++;
2133 
2134 			/*
2135 			 * wakeup/signal if all of the interrupt jobs are done.
2136 			 */
2137 			if (lj->lioj_buffer_finished_count ==
2138 			    lj->lioj_buffer_count) {
2139 				/*
2140 				 * Post a signal if it is called for.
2141 				 */
2142 				if ((lj->lioj_flags &
2143 				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2144 				    LIOJ_SIGNAL) {
2145 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2146 					aiocbe->timeouthandle =
2147 						timeout(process_signal,
2148 							aiocbe, 0);
2149 				}
2150 			}
2151 		}
2152 
2153 		ki = p->p_aioinfo;
2154 		if (ki) {
2155 			ki->kaio_buffer_finished_count++;
2156 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2157 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2158 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2159 
2160 			KNOTE(&aiocbe->klist, 0);
2161 			/* Do the wakeup. */
2162 			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2163 				ki->kaio_flags &= ~KAIO_WAKEUP;
2164 				wakeup(p);
2165 			}
2166 		}
2167 
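		/*
		 * Defer the per-request signal to a timeout handler;
		 * psignal() cannot safely be called from this interrupt
		 * context (see process_signal() above).
		 */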
2168 		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2169 			aiocbe->timeouthandle =
2170 				timeout(process_signal, aiocbe, 0);
2171 	}
2172 }
2173 
2174 /* syscall - wait for the next completion of an aio request */
2175 int
2176 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2177 {
2178 	struct proc *p = td->td_proc;
2179 	struct timeval atv;
2180 	struct timespec ts;
2181 	struct kaioinfo *ki;
2182 	struct aiocblist *cb = NULL;
2183 	int error, s, timo;
2184 
2185 	suword(uap->aiocbp, (long)NULL);
2186 
2187 	timo = 0;
2188 	if (uap->timeout) {
2189 		/* Get timespec struct. */
2190 		error = copyin(uap->timeout, &ts, sizeof(ts));
2191 		if (error)
2192 			return error;
2193 
2194 		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2195 			return (EINVAL);
2196 
2197 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
2198 		if (itimerfix(&atv))
2199 			return (EINVAL);
2200 		timo = tvtohz(&atv);
2201 	}
2202 
2203 	ki = p->p_aioinfo;
2204 	if (ki == NULL)
2205 		return EAGAIN;
2206 
2207 	for (;;) {
2208 		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2209 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2210 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2211 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2212 				p->p_stats->p_ru.ru_oublock +=
2213 				    cb->outputcharge;
2214 				cb->outputcharge = 0;
2215 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2216 				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2217 				cb->inputcharge = 0;
2218 			}
2219 			error = cb->uaiocb._aiocb_private.error;
2220 			aio_free_entry(cb);
			return error;
2221 		}
2222 
2223 		s = splbio();
2224 		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0) {
2225 			splx(s);
2226 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2227 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2228 			error = cb->uaiocb._aiocb_private.error;
2229 			aio_free_entry(cb);
			return error;
2230 		}
2231 
2232 		ki->kaio_flags |= KAIO_WAKEUP;
2233 		error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2234 		splx(s);
2235 
2236 		if (error == ERESTART)
2237 			return EINTR;
2238 		else if (error < 0)
2239 			return error;
2240 		else if (error == EINTR)
2241 			return EINTR;
2242 		else if (error == EWOULDBLOCK)
2243 			return EAGAIN;
2244 	}
2245 }
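
#if 0
/*
 * Illustrative userland usage sketch only (not kernel code, never
 * compiled): wait for any outstanding AIO request of the process to
 * complete.  aio_waitcomplete() returns the transfer count and hands
 * back the completed request's aiocb pointer; the helper name is
 * hypothetical and a real caller would include <aio.h> and <err.h>.
 */
static ssize_t
wait_for_any_aio(void)
{
	struct aiocb *iocbp;
	struct timespec ts;
	ssize_t cnt;

	ts.tv_sec = 5;			/* give up after five seconds */
	ts.tv_nsec = 0;

	cnt = aio_waitcomplete(&iocbp, &ts);
	if (cnt == -1)
		err(1, "aio_waitcomplete");	/* EAGAIN on timeout */

	/* iocbp now points at the caller's completed control block. */
	return (cnt);
}
#endif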
2246 
2247 /* kqueue attach function */
2248 static int
2249 filt_aioattach(struct knote *kn)
2250 {
2251 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2252 
2253 	/*
2254 	 * The aiocbe pointer must be validated before using it, so
2255 	 * registration is restricted to the kernel; the user cannot
2256 	 * set EV_FLAG1.
2257 	 */
2258 	if ((kn->kn_flags & EV_FLAG1) == 0)
2259 		return (EPERM);
2260 	kn->kn_flags &= ~EV_FLAG1;
2261 
2262 	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2263 
2264 	return (0);
2265 }
2266 
2267 /* kqueue detach function */
2268 static void
2269 filt_aiodetach(struct knote *kn)
2270 {
2271 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2272 
2273 	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2274 }
2275 
2276 /* kqueue filter function */
2277 /*ARGSUSED*/
2278 static int
2279 filt_aio(struct knote *kn, long hint)
2280 {
2281 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2282 
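	/*
	 * Report the job's error status as the event data; the event only
	 * fires once the request has completed through either the aiod
	 * (JOBST_JOBFINISHED) or the physio (JOBST_JOBBFINISHED) path.
	 */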
2283 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2284 	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2285 	    aiocbe->jobstate != JOBST_JOBBFINISHED)
2286 		return (0);
2287 	kn->kn_flags |= EV_EOF;
2288 	return (1);
2289 }
2290