xref: /freebsd/sys/kern/vfs_aio.c (revision 822923447e454b30d310cb46903c9ddeca9f0a7a)
1 /*-
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  */
16 
17 /*
18  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
19  */
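
/*
 * For orientation, a minimal, illustrative-only userland sketch of the
 * interface implemented here (error handling omitted; assumes an open
 * descriptor fd and a buffer buf):
 *
 *	struct aiocb cb;
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	aio_read(&cb);				enqueue the request
 *	const struct aiocb *list[1] = { &cb };
 *	while (aio_error(&cb) == EINPROGRESS)
 *		aio_suspend(list, 1, NULL);	wait for completion
 *	ssize_t n = aio_return(&cb);		reap status, release kernel state
 */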
20 
21 #include <sys/cdefs.h>
22 __FBSDID("$FreeBSD$");
23 
24 #include <sys/param.h>
25 #include <sys/systm.h>
26 #include <sys/malloc.h>
27 #include <sys/bio.h>
28 #include <sys/buf.h>
29 #include <sys/eventhandler.h>
30 #include <sys/sysproto.h>
31 #include <sys/filedesc.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/kthread.h>
35 #include <sys/fcntl.h>
36 #include <sys/file.h>
37 #include <sys/limits.h>
38 #include <sys/lock.h>
39 #include <sys/mutex.h>
40 #include <sys/unistd.h>
41 #include <sys/proc.h>
42 #include <sys/resourcevar.h>
43 #include <sys/signalvar.h>
44 #include <sys/protosw.h>
45 #include <sys/socketvar.h>
46 #include <sys/syscall.h>
47 #include <sys/sysent.h>
48 #include <sys/sysctl.h>
49 #include <sys/sx.h>
50 #include <sys/vnode.h>
51 #include <sys/conf.h>
52 #include <sys/event.h>
53 
54 #include <posix4/posix4.h>
55 #include <vm/vm.h>
56 #include <vm/vm_extern.h>
57 #include <vm/pmap.h>
58 #include <vm/vm_map.h>
59 #include <vm/uma.h>
60 #include <sys/aio.h>
61 
62 #include "opt_vfs_aio.h"
63 
64 NET_NEEDS_GIANT("aio");
65 
66 /*
67  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
68  * overflow.
69  */
70 static	long jobrefid;
71 
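/*
 * Job states.  A request serviced by an aio daemon moves through
 * JOBST_JOBQGLOBAL -> JOBST_JOBRUNNING -> JOBST_JOBFINISHED, while a
 * request handled by the direct physio path moves through
 * JOBST_JOBQBUF -> JOBST_JOBBFINISHED.  JOBST_NULL marks a control
 * block that is not currently queued.
 */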
72 #define JOBST_NULL		0x0
73 #define JOBST_JOBQGLOBAL	0x2
74 #define JOBST_JOBRUNNING	0x3
75 #define JOBST_JOBFINISHED	0x4
76 #define	JOBST_JOBQBUF		0x5
77 #define	JOBST_JOBBFINISHED	0x6
78 
79 #ifndef MAX_AIO_PER_PROC
80 #define MAX_AIO_PER_PROC	32
81 #endif
82 
83 #ifndef MAX_AIO_QUEUE_PER_PROC
84 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
85 #endif
86 
87 #ifndef MAX_AIO_PROCS
88 #define MAX_AIO_PROCS		32
89 #endif
90 
91 #ifndef MAX_AIO_QUEUE
92 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
93 #endif
94 
95 #ifndef TARGET_AIO_PROCS
96 #define TARGET_AIO_PROCS	4
97 #endif
98 
99 #ifndef MAX_BUF_AIO
100 #define MAX_BUF_AIO		16
101 #endif
102 
103 #ifndef AIOD_TIMEOUT_DEFAULT
104 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
105 #endif
106 
107 #ifndef AIOD_LIFETIME_DEFAULT
108 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
109 #endif
110 
111 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
112 
113 static int max_aio_procs = MAX_AIO_PROCS;
114 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
115 	CTLFLAG_RW, &max_aio_procs, 0,
116 	"Maximum number of kernel threads to use for handling async IO");
117 
118 static int num_aio_procs = 0;
119 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
120 	CTLFLAG_RD, &num_aio_procs, 0,
121 	"Number of presently active kernel threads for async IO");
122 
123 /*
124  * The code will adjust the actual number of AIO processes towards this
125  * number when it gets a chance.
126  */
127 static int target_aio_procs = TARGET_AIO_PROCS;
128 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
129 	0, "Preferred number of ready kernel threads for async IO");
130 
131 static int max_queue_count = MAX_AIO_QUEUE;
132 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
133     "Maximum number of aio requests to queue, globally");
134 
135 static int num_queue_count = 0;
136 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
137     "Number of queued aio requests");
138 
139 static int num_buf_aio = 0;
140 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
141     "Number of aio requests presently handled by the buf subsystem");
142 
143 /* Number of async I/O threads in the process of being started */
144 /* XXX This should be local to _aio_aqueue() */
145 static int num_aio_resv_start = 0;
146 
147 static int aiod_timeout;
148 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
149     "Timeout value for synchronous aio operations");
150 
151 static int aiod_lifetime;
152 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
153     "Maximum lifetime for idle aiod");
154 
155 static int unloadable = 0;
156 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
157     "Allow unload of aio (not recommended)");
158 
159 
160 static int max_aio_per_proc = MAX_AIO_PER_PROC;
161 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
162     0, "Maximum active aio requests per process (stored in the process)");
163 
164 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
165 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
166     &max_aio_queue_per_proc, 0,
167     "Maximum queued aio requests per process (stored in the process)");
168 
169 static int max_buf_aio = MAX_BUF_AIO;
170 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
171     "Maximum buf aio requests per process (stored in the process)");
172 
173 struct aiocblist {
174 	TAILQ_ENTRY(aiocblist) list;	/* List of jobs */
175 	TAILQ_ENTRY(aiocblist) plist;	/* List of jobs for proc */
176 	int	jobflags;
177 	int	jobstate;
178 	int	inputcharge;		/* input blocks to credit to the user via aio_return() */
179 	int	outputcharge;		/* output blocks to credit to the user via aio_return() */
180 	struct	buf *bp;		/* Buffer pointer */
181 	struct	proc *userproc;		/* User process (not td!) */
182 	struct  ucred *cred;		/* Active credential when created */
183 	struct	file *fd_file;		/* Pointer to file structure */
184 	struct	aio_liojob *lio;	/* Optional lio job */
185 	struct	aiocb *uuaiocb;		/* Pointer in userspace of aiocb */
186 	struct	knlist klist;		/* list of knotes */
187 	struct	aiocb uaiocb;		/* Kernel I/O control block */
188 };
189 
190 /* jobflags */
191 #define AIOCBLIST_RUNDOWN	0x4	/* aio_free_entry() is waiting on this job */
192 #define AIOCBLIST_DONE		0x10	/* I/O for this job has completed */
193 
194 /*
195  * AIO process info
196  */
197 #define AIOP_FREE	0x1			/* proc on free queue */
198 
199 struct aiothreadlist {
200 	int aiothreadflags;			/* AIO proc flags */
201 	TAILQ_ENTRY(aiothreadlist) list;	/* List of processes */
202 	struct thread *aiothread;		/* The AIO thread */
203 };
204 
205 /*
206  * data-structure for lio signal management
207  */
208 struct aio_liojob {
209 	int	lioj_flags;
210 	int	lioj_buffer_count;
211 	int	lioj_buffer_finished_count;
212 	int	lioj_queue_count;
213 	int	lioj_queue_finished_count;
214 	int	lioj_total_count;
215 	struct	sigevent lioj_signal;	/* signal on all I/O done */
216 	TAILQ_ENTRY(aio_liojob) lioj_list;
217 	struct	knlist klist;		/* list of knotes */
218 };
219 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
220 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
221 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
222 
223 /*
224  * per process aio data structure
225  */
226 struct kaioinfo {
227 	int	kaio_flags;		/* per process kaio flags */
228 	int	kaio_maxactive_count;	/* maximum number of AIOs */
229 	int	kaio_active_count;	/* number of currently used AIOs */
230 	int	kaio_qallowed_count;	/* maximum size of AIO queue */
231 	int	kaio_queue_count;	/* size of AIO queue */
232 	int	kaio_ballowed_count;	/* maximum number of buffers */
233 	int	kaio_queue_finished_count; /* number of daemon jobs finished */
234 	int	kaio_buffer_count;	/* number of physio buffers */
235 	int	kaio_buffer_finished_count; /* count of I/O done */
236 	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
237 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
238 	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
239 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
240 	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
241 	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
242 };
243 
244 #define KAIO_RUNDOWN	0x1	/* process is being run down */
245 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
246 
247 static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* Idle daemons */
248 static struct mtx aio_freeproc_mtx;
249 
250 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
251 
252 static void	aio_init_aioinfo(struct proc *p);
253 static void	aio_onceonly(void);
254 static int	aio_free_entry(struct aiocblist *aiocbe);
255 static void	aio_process(struct aiocblist *aiocbe);
256 static int	aio_newproc(void);
257 static int	aio_aqueue(struct thread *td, struct aiocb *job, int type);
258 static void	aio_physwakeup(struct buf *bp);
259 static void	aio_proc_rundown(void *arg, struct proc *p);
260 static int	aio_fphysio(struct aiocblist *aiocbe);
261 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
262 static void	aio_daemon(void *uproc);
263 static void	aio_swake_cb(struct socket *, struct sockbuf *);
264 static int	aio_unload(void);
265 static int	filt_aioattach(struct knote *kn);
266 static void	filt_aiodetach(struct knote *kn);
267 static int	filt_aio(struct knote *kn, long hint);
268 static int	filt_lioattach(struct knote *kn);
269 static void	filt_liodetach(struct knote *kn);
270 static int	filt_lio(struct knote *kn, long hint);
271 #define DONE_BUF	1	/* physio completion */
272 #define DONE_QUEUE	2	/* aio daemon completion */
273 static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
274 
275 /*
276  * Zones for:
277  * 	kaio	Per process async io info
278  *	aiop	async io thread data
279  *	aiocb	async io jobs
280  *	aiol	list io job pointer - internal to aio_suspend XXX
281  *	aiolio	list io jobs
282  */
283 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
284 
285 /* kqueue filters for aio */
286 static struct filterops aio_filtops =
287 	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
288 static struct filterops lio_filtops =
289 	{ 0, filt_lioattach, filt_liodetach, filt_lio };
290 
291 static eventhandler_tag exit_tag, exec_tag;
292 
293 /*
294  * Main operations function for use as a kernel module.
295  */
296 static int
297 aio_modload(struct module *module, int cmd, void *arg)
298 {
299 	int error = 0;
300 
301 	switch (cmd) {
302 	case MOD_LOAD:
303 		aio_onceonly();
304 		break;
305 	case MOD_UNLOAD:
306 		error = aio_unload();
307 		break;
308 	case MOD_SHUTDOWN:
309 		break;
310 	default:
311 		error = EINVAL;
312 		break;
313 	}
314 	return (error);
315 }
316 
317 static moduledata_t aio_mod = {
318 	"aio",
319 	&aio_modload,
320 	NULL
321 };
322 
323 SYSCALL_MODULE_HELPER(aio_return);
324 SYSCALL_MODULE_HELPER(aio_suspend);
325 SYSCALL_MODULE_HELPER(aio_cancel);
326 SYSCALL_MODULE_HELPER(aio_error);
327 SYSCALL_MODULE_HELPER(aio_read);
328 SYSCALL_MODULE_HELPER(aio_write);
329 SYSCALL_MODULE_HELPER(aio_waitcomplete);
330 SYSCALL_MODULE_HELPER(lio_listio);
331 
332 DECLARE_MODULE(aio, aio_mod,
333 	SI_SUB_VFS, SI_ORDER_ANY);
334 MODULE_VERSION(aio, 1);
335 
336 /*
337  * Startup initialization
338  */
339 static void
340 aio_onceonly(void)
341 {
342 
343 	/* XXX: should probably just use so->callback */
344 	aio_swake = &aio_swake_cb;
345 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
346 	    EVENTHANDLER_PRI_ANY);
347 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown, NULL,
348 	    EVENTHANDLER_PRI_ANY);
349 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
350 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
351 	TAILQ_INIT(&aio_freeproc);
352 	mtx_init(&aio_freeproc_mtx, "aio_freeproc", NULL, MTX_DEF);
353 	TAILQ_INIT(&aio_jobs);
354 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
355 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
356 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
357 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
358 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
359 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
360 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
361 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
362 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aio_liojob), NULL,
363 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
364 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
365 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
366 	jobrefid = 1;
367 	async_io_version = _POSIX_VERSION;
368 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
369 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
370 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
371 }
372 
373 /*
374  * Callback for unload of AIO when used as a module.
375  */
376 static int
377 aio_unload(void)
378 {
379 	int error;
380 
381 	/*
382 	 * XXX: no unloads by default, it's too dangerous.
383 	 * perhaps we could do it if locked out callers and then
384 	 * did an aio_proc_rundown() on each process.
385 	 */
386 	if (!unloadable)
387 		return (EOPNOTSUPP);
388 
389 	error = kqueue_del_filteropts(EVFILT_AIO);
390 	if (error)
391 		return error;
392 
393 	async_io_version = 0;
394 	aio_swake = NULL;
395 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
396 	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
397 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
398 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
399 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
400 	return (0);
401 }
402 
403 /*
404  * Init the per-process aioinfo structure.  The aioinfo limits are set
405  * per-process for user limit (resource) management.
406  */
407 static void
408 aio_init_aioinfo(struct proc *p)
409 {
410 	struct kaioinfo *ki;
411 
412 	ki = uma_zalloc(kaio_zone, M_WAITOK);
413 	ki->kaio_flags = 0;
414 	ki->kaio_maxactive_count = max_aio_per_proc;
415 	ki->kaio_active_count = 0;
416 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
417 	ki->kaio_queue_count = 0;
418 	ki->kaio_ballowed_count = max_buf_aio;
419 	ki->kaio_buffer_count = 0;
420 	ki->kaio_buffer_finished_count = 0;
421 	TAILQ_INIT(&ki->kaio_jobdone);
422 	TAILQ_INIT(&ki->kaio_jobqueue);
423 	TAILQ_INIT(&ki->kaio_bufdone);
424 	TAILQ_INIT(&ki->kaio_bufqueue);
425 	TAILQ_INIT(&ki->kaio_liojoblist);
426 	TAILQ_INIT(&ki->kaio_sockqueue);
427 	PROC_LOCK(p);
428 	if (p->p_aioinfo == NULL) {
429 		p->p_aioinfo = ki;
430 		PROC_UNLOCK(p);
431 	} else {
432 		PROC_UNLOCK(p);
433 		uma_zfree(kaio_zone, ki);
434 	}
435 
436 	while (num_aio_procs < target_aio_procs)
437 		aio_newproc();
438 }
439 
440 /*
441  * Free a job entry.  Wait for completion if it is currently active, but don't
442  * delay forever.  If we delay, we return a flag that says that we have to
443  * restart the queue scan.
444  */
445 static int
446 aio_free_entry(struct aiocblist *aiocbe)
447 {
448 	struct kaioinfo *ki;
449 	struct aio_liojob *lj;
450 	struct proc *p;
451 	int error;
452 	int s;
453 
454 	if (aiocbe->jobstate == JOBST_NULL)
455 		panic("aio_free_entry: freeing already free job");
456 
457 	p = aiocbe->userproc;
458 	ki = p->p_aioinfo;
459 	lj = aiocbe->lio;
460 	if (ki == NULL)
461 		panic("aio_free_entry: missing p->p_aioinfo");
462 
463 	while (aiocbe->jobstate == JOBST_JOBRUNNING) {
464 		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
465 		tsleep(aiocbe, PRIBIO, "jobwai", 0);
466 	}
467 	if (aiocbe->bp == NULL) {
468 		if (ki->kaio_queue_count <= 0)
469 			panic("aio_free_entry: process queue size <= 0");
470 		if (num_queue_count <= 0)
471 			panic("aio_free_entry: system wide queue size <= 0");
472 
473 		if (lj) {
474 			lj->lioj_queue_count--;
475 			if (aiocbe->jobflags & AIOCBLIST_DONE)
476 				lj->lioj_queue_finished_count--;
477 		}
478 		ki->kaio_queue_count--;
479 		if (aiocbe->jobflags & AIOCBLIST_DONE)
480 			ki->kaio_queue_finished_count--;
481 		num_queue_count--;
482 	} else {
483 		if (lj) {
484 			lj->lioj_buffer_count--;
485 			if (aiocbe->jobflags & AIOCBLIST_DONE)
486 				lj->lioj_buffer_finished_count--;
487 		}
488 		if (aiocbe->jobflags & AIOCBLIST_DONE)
489 			ki->kaio_buffer_finished_count--;
490 		ki->kaio_buffer_count--;
491 		num_buf_aio--;
492 	}
493 
494 	/* aiocbe is going away, we need to destroy any knotes */
495 	/* XXXKSE Note: the thread here is used to eventually find the
496 	 * owning process again, but it is also used to do a fo_close,
497 	 * and that requires the thread.  (But does it require the OWNING
498 	 * thread, or maybe the running thread?)  There is a semantic
499 	 * problem here...
500 	 */
501 	if (lj)
502 		knlist_delete(&lj->klist, FIRST_THREAD_IN_PROC(p), 0); /* XXXKSE */
503 	knlist_delete(&aiocbe->klist, FIRST_THREAD_IN_PROC(p), 0); /* XXXKSE */
504 
505 	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
506 	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
507 		ki->kaio_flags &= ~KAIO_WAKEUP;
508 		wakeup(p);
509 	}
510 
511 	if (aiocbe->jobstate == JOBST_JOBQBUF) {
512 		if ((error = aio_fphysio(aiocbe)) != 0)
513 			return (error);
514 		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
515 			panic("aio_free_entry: invalid physio finish-up state");
516 		s = splbio();
517 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
518 		splx(s);
519 	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
520 		s = splnet();
521 		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
522 		TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
523 		splx(s);
524 	} else if (aiocbe->jobstate == JOBST_JOBFINISHED)
525 		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
526 	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
527 		s = splbio();
528 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
529 		splx(s);
530 		if (aiocbe->bp) {
531 			vunmapbuf(aiocbe->bp);
532 			relpbuf(aiocbe->bp, NULL);
533 			aiocbe->bp = NULL;
534 		}
535 	}
536 	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
537 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
538 		uma_zfree(aiolio_zone, lj);
539 	}
540 	aiocbe->jobstate = JOBST_NULL;
541 	fdrop(aiocbe->fd_file, curthread);
542 	crfree(aiocbe->cred);
543 	uma_zfree(aiocb_zone, aiocbe);
544 	return (0);
545 }
546 
547 /*
548  * Rundown the jobs for a given process.
549  */
550 static void
551 aio_proc_rundown(void *arg, struct proc *p)
552 {
553 	int s;
554 	struct kaioinfo *ki;
555 	struct aio_liojob *lj, *ljn;
556 	struct aiocblist *aiocbe, *aiocbn;
557 	struct file *fp;
558 	struct socket *so;
559 
560 	ki = p->p_aioinfo;
561 	if (ki == NULL)
562 		return;
563 
564 	mtx_lock(&Giant);
565 	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
566 	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
567 	    ki->kaio_buffer_finished_count)) {
568 		ki->kaio_flags |= KAIO_RUNDOWN;
569 		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
570 			break;
571 	}
572 
573 	/*
574 	 * Move any aio ops that are waiting on socket I/O to the normal job
575 	 * queues so they are cleaned up with any others.
576 	 */
577 	s = splnet();
578 	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
579 	    aiocbn) {
580 		aiocbn = TAILQ_NEXT(aiocbe, plist);
581 		fp = aiocbe->fd_file;
582 		if (fp != NULL) {
583 			so = fp->f_data;
584 			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
585 			if (TAILQ_EMPTY(&so->so_aiojobq)) {
586 				SOCKBUF_LOCK(&so->so_snd);
587 				so->so_snd.sb_flags &= ~SB_AIO;
588 				SOCKBUF_UNLOCK(&so->so_snd);
589 				SOCKBUF_LOCK(&so->so_rcv);
590 				so->so_rcv.sb_flags &= ~SB_AIO;
591 				SOCKBUF_UNLOCK(&so->so_rcv);
592 			}
593 		}
594 		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
595 		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
596 		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
597 	}
598 	splx(s);
599 
600 restart1:
601 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
602 		aiocbn = TAILQ_NEXT(aiocbe, plist);
603 		if (aio_free_entry(aiocbe))
604 			goto restart1;
605 	}
606 
607 restart2:
608 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
609 	    aiocbn) {
610 		aiocbn = TAILQ_NEXT(aiocbe, plist);
611 		if (aio_free_entry(aiocbe))
612 			goto restart2;
613 	}
614 
615 /*
616  * Note the use of lots of splbio here, trying to avoid holding splbio
617  * across long chains of I/O.  Probably unnecessary.
618  */
619 restart3:
620 	s = splbio();
621 	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
622 		ki->kaio_flags |= KAIO_WAKEUP;
623 		tsleep(p, PRIBIO, "aioprn", 0);
624 		splx(s);
625 		goto restart3;
626 	}
627 	splx(s);
628 
629 restart4:
630 	s = splbio();
631 	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
632 		aiocbn = TAILQ_NEXT(aiocbe, plist);
633 		if (aio_free_entry(aiocbe)) {
634 			splx(s);
635 			goto restart4;
636 		}
637 	}
638 	splx(s);
639 
640 	/*
641 	 * If we've slept, jobs might have moved from one queue to another.
642 	 * Retry rundown if we didn't manage to empty the queues.
643 	 */
644 	if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
645 	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
646 	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
647 	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
648 		goto restart1;
649 
650 	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
651 		ljn = TAILQ_NEXT(lj, lioj_list);
652 		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
653 		    0)) {
654 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
655 			uma_zfree(aiolio_zone, lj);
656 		} else {
657 #ifdef DIAGNOSTIC
658 			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
659 			    "QF:%d\n", lj->lioj_buffer_count,
660 			    lj->lioj_buffer_finished_count,
661 			    lj->lioj_queue_count,
662 			    lj->lioj_queue_finished_count);
663 #endif
664 		}
665 	}
666 
667 	uma_zfree(kaio_zone, ki);
668 	p->p_aioinfo = NULL;
669 	mtx_unlock(&Giant);
670 }
671 
672 /*
673  * Select a job to run (called by an AIO daemon).
674  */
675 static struct aiocblist *
676 aio_selectjob(struct aiothreadlist *aiop)
677 {
678 	int s;
679 	struct aiocblist *aiocbe;
680 	struct kaioinfo *ki;
681 	struct proc *userp;
682 
683 	s = splnet();
684 	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
685 	    TAILQ_NEXT(aiocbe, list)) {
686 		userp = aiocbe->userproc;
687 		ki = userp->p_aioinfo;
688 
689 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
690 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
691 			splx(s);
692 			return (aiocbe);
693 		}
694 	}
695 	splx(s);
696 
697 	return (NULL);
698 }
699 
700 /*
701  * The AIO processing activity.  This is the code that does the I/O request for
702  * the non-physio version of the operations.  The normal file operations
703  * (fo_read()/fo_write()) are used, so this code works for every type of
704  * file, including pipes, sockets, fifos, and regular files.
705  */
706 static void
707 aio_process(struct aiocblist *aiocbe)
708 {
709 	struct ucred *td_savedcred;
710 	struct thread *td;
711 	struct proc *mycp;
712 	struct aiocb *cb;
713 	struct file *fp;
714 	struct uio auio;
715 	struct iovec aiov;
716 	int cnt;
717 	int error;
718 	int oublock_st, oublock_end;
719 	int inblock_st, inblock_end;
720 
721 	td = curthread;
722 	td_savedcred = td->td_ucred;
723 	td->td_ucred = aiocbe->cred;
724 	mycp = td->td_proc;
725 	cb = &aiocbe->uaiocb;
726 	fp = aiocbe->fd_file;
727 
728 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
729 	aiov.iov_len = cb->aio_nbytes;
730 
731 	auio.uio_iov = &aiov;
732 	auio.uio_iovcnt = 1;
733 	auio.uio_offset = cb->aio_offset;
734 	auio.uio_resid = cb->aio_nbytes;
735 	cnt = cb->aio_nbytes;
736 	auio.uio_segflg = UIO_USERSPACE;
737 	auio.uio_td = td;
738 
739 	inblock_st = mycp->p_stats->p_ru.ru_inblock;
740 	oublock_st = mycp->p_stats->p_ru.ru_oublock;
741 	/*
742 	 * _aio_aqueue() acquires a reference to the file that is
743 	 * released in aio_free_entry().
744 	 */
745 	if (cb->aio_lio_opcode == LIO_READ) {
746 		auio.uio_rw = UIO_READ;
747 		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
748 	} else {
749 		auio.uio_rw = UIO_WRITE;
750 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
751 	}
752 	inblock_end = mycp->p_stats->p_ru.ru_inblock;
753 	oublock_end = mycp->p_stats->p_ru.ru_oublock;
754 
755 	aiocbe->inputcharge = inblock_end - inblock_st;
756 	aiocbe->outputcharge = oublock_end - oublock_st;
757 
758 	if ((error) && (auio.uio_resid != cnt)) {
759 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
760 			error = 0;
761 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
762 			PROC_LOCK(aiocbe->userproc);
763 			psignal(aiocbe->userproc, SIGPIPE);
764 			PROC_UNLOCK(aiocbe->userproc);
765 		}
766 	}
767 
768 	cnt -= auio.uio_resid;
769 	cb->_aiocb_private.error = error;
770 	cb->_aiocb_private.status = cnt;
771 	td->td_ucred = td_savedcred;
772 }
773 
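/*
 * Move a completed request onto its process' done queue and deliver any
 * requested completion notification (kevent and/or signal).  Used for
 * both aio daemon (DONE_QUEUE) and physio (DONE_BUF) completions.
 */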
774 static void
775 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
{
776 	int lj_done;
777 	struct aio_liojob *lj;
778 	struct kaioinfo *ki;
779 
780 	ki = userp->p_aioinfo;
781 	lj = aiocbe->lio;
782 	lj_done = 0;
783 	if (lj) {
784 		if (type == DONE_QUEUE)
785 			lj->lioj_queue_finished_count++;
786 		else
787 			lj->lioj_buffer_finished_count++;
788 		if (lj->lioj_queue_finished_count +
789 		    lj->lioj_buffer_finished_count ==
790 		    lj->lioj_total_count)
791 			lj_done = 1;
792 	}
793 
794 	if (ki) {
795 		if (type == DONE_QUEUE) {
796 			ki->kaio_queue_finished_count++;
797 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
798 			TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
799 		} else {
800 			ki->kaio_buffer_finished_count++;
801 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
802 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
803 		}
804 		if (lj_done) {
805 			if (!knlist_empty(&lj->klist)
806 			    && lj->lioj_signal.sigev_notify ==
807 			    SIGEV_KEVENT) {
808 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
809 				KNOTE_UNLOCKED(&lj->klist, 0);
810 			}
811 			if ((lj->lioj_flags &
812 			     (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
813 			    == LIOJ_SIGNAL
814 			    && lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
815 				PROC_LOCK(userp);
816 				psignal(userp, lj->lioj_signal.sigev_signo);
817 				PROC_UNLOCK(userp);
818 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
819 			}
820 		}
821 		KNOTE_UNLOCKED(&aiocbe->klist, 0);
822 
823 		if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
824 			ki->kaio_flags &= ~KAIO_WAKEUP;
825 			wakeup(userp);
826 		}
827 	}
828 
829 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
830 		PROC_LOCK(userp);
831 		psignal(userp, aiocbe->uaiocb.aio_sigevent.sigev_signo);
832 		PROC_UNLOCK(userp);
833 	}
834 }
835 /*
836  * The AIO daemon.  Most of the actual work is done in aio_process(),
837  * but the setup (and address space management) is done in this routine.
838  */
839 static void
840 aio_daemon(void *uproc)
841 {
842 	int s;
843 	struct aiocb *cb;
844 	struct aiocblist *aiocbe;
845 	struct aiothreadlist *aiop;
846 	struct kaioinfo *ki;
847 	struct proc *curcp, *mycp, *userp;
848 	struct vmspace *myvm, *tmpvm;
849 	struct thread *td = curthread;
850 	struct pgrp *newpgrp;
851 	struct session *newsess;
852 
853 	/*
854 	 * Local copies of curproc (cp) and vmspace (myvm)
855 	 */
856 	mycp = td->td_proc;
857 	myvm = mycp->p_vmspace;
858 
859 	KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
860 
861 	/*
862 	 * Allocate and ready the aio control info.  There is one aiop structure
863 	 * per daemon.
864 	 */
865 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
866 	aiop->aiothread = td;
867 	aiop->aiothreadflags |= AIOP_FREE;
868 
869 	/*
870 	 * Place thread (lightweight process) onto the AIO free thread list.
871 	 */
872 	mtx_lock(&aio_freeproc_mtx);
873 	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
874 	mtx_unlock(&aio_freeproc_mtx);
875 
876 	/*
877 	 * Get rid of our current file descriptors.  AIODs don't need any
878 	 * file descriptors, except as temporarily inherited from the client.
879 	 */
880 	mtx_lock(&Giant);
881 	fdfree(td);
882 
883 	mtx_unlock(&Giant);
884 	/* The daemon resides in its own pgrp. */
885 	MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
886 		M_WAITOK | M_ZERO);
887 	MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION,
888 		M_WAITOK | M_ZERO);
889 
890 	sx_xlock(&proctree_lock);
891 	enterpgrp(mycp, mycp->p_pid, newpgrp, newsess);
892 	sx_xunlock(&proctree_lock);
893 	mtx_lock(&Giant);
894 
895 	/*
896 	 * Wake up the parent process.  (The parent sleeps to keep from
897 	 * blasting away and creating too many daemons.)
898 	 */
899 	wakeup(mycp);
900 
901 	for (;;) {
902 		/*
903 		 * curcp is the current daemon process context.
904 		 * userp is the current user process context.
905 		 */
906 		curcp = mycp;
907 
908 		/*
909 		 * Take daemon off of free queue
910 		 */
911 		mtx_lock(&aio_freeproc_mtx);
912 		if (aiop->aiothreadflags & AIOP_FREE) {
913 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
914 			aiop->aiothreadflags &= ~AIOP_FREE;
915 		}
916 		mtx_unlock(&aio_freeproc_mtx);
917 
918 		/*
919 		 * Check for jobs.
920 		 */
921 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
922 			cb = &aiocbe->uaiocb;
923 			userp = aiocbe->userproc;
924 
925 			aiocbe->jobstate = JOBST_JOBRUNNING;
926 
927 			/*
928 			 * Connect to process address space for user program.
929 			 */
930 			if (userp != curcp) {
931 				/*
932 				 * Save the current address space that we are
933 				 * connected to.
934 				 */
935 				tmpvm = mycp->p_vmspace;
936 
937 				/*
938 				 * Point to the new user address space, and
939 				 * refer to it.
940 				 */
941 				mycp->p_vmspace = userp->p_vmspace;
942 				atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
943 
944 				/* Activate the new mapping. */
945 				pmap_activate(FIRST_THREAD_IN_PROC(mycp));
946 
947 				 * If the old address space wasn't the daemon's
948 				 * If the old address space wasn't the daemons
949 				 * own address space, then we need to remove the
950 				 * daemon's reference from the other process
951 				 * that it was acting on behalf of.
952 				 */
953 				if (tmpvm != myvm) {
954 					vmspace_free(tmpvm);
955 				}
956 				curcp = userp;
957 			}
958 
959 			ki = userp->p_aioinfo;
960 
961 			/* Account for currently active jobs. */
962 			ki->kaio_active_count++;
963 
964 			/* Do the I/O function. */
965 			aio_process(aiocbe);
966 
967 			s = splbio();
968 			/* Decrement the active job count. */
969 			ki->kaio_active_count--;
970 
971 			aiocbe->jobflags |= AIOCBLIST_DONE;
972 			aiocbe->jobstate = JOBST_JOBFINISHED;
973 			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
974 			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
975 				wakeup(aiocbe);
976 				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
977 			}
978 		}
979 
980 		/*
981 		 * Disconnect from user address space.
982 		 */
983 		if (curcp != mycp) {
984 			/* Get the user address space to disconnect from. */
985 			tmpvm = mycp->p_vmspace;
986 
987 			/* Get original address space for daemon. */
988 			mycp->p_vmspace = myvm;
989 
990 			/* Activate the daemon's address space. */
991 			pmap_activate(FIRST_THREAD_IN_PROC(mycp));
992 #ifdef DIAGNOSTIC
993 			if (tmpvm == myvm) {
994 				printf("AIOD: vmspace problem -- %d\n",
995 				    mycp->p_pid);
996 			}
997 #endif
998 			/* Remove our vmspace reference. */
999 			vmspace_free(tmpvm);
1000 
1001 			curcp = mycp;
1002 		}
1003 
1004 		mtx_lock(&aio_freeproc_mtx);
1005 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
1006 		aiop->aiothreadflags |= AIOP_FREE;
1007 
1008 		/*
1009 		 * If daemon is inactive for a long time, allow it to exit,
1010 		 * thereby freeing resources.
1011 		 */
1012 		if (msleep(aiop->aiothread, &aio_freeproc_mtx, PDROP | PRIBIO,
1013 		    "aiordy", aiod_lifetime)) {
1014 			s = splnet();
1015 			if (TAILQ_EMPTY(&aio_jobs)) {
1016 				mtx_lock(&aio_freeproc_mtx);
1017 				if ((aiop->aiothreadflags & AIOP_FREE) &&
1018 				    (num_aio_procs > target_aio_procs)) {
1019 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
1020 					mtx_unlock(&aio_freeproc_mtx);
1021 					splx(s);
1022 					uma_zfree(aiop_zone, aiop);
1023 					num_aio_procs--;
1024 #ifdef DIAGNOSTIC
1025 					if (mycp->p_vmspace->vm_refcnt <= 1) {
1026 						printf("AIOD: bad vm refcnt for"
1027 						    " exiting daemon: %d\n",
1028 						    mycp->p_vmspace->vm_refcnt);
1029 					}
1030 #endif
1031 					kthread_exit(0);
1032 				}
1033 				mtx_unlock(&aio_freeproc_mtx);
1034 			}
1035 			splx(s);
1036 		}
1037 	}
1038 }
1039 
1040 /*
1041  * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
1042  * AIO daemon modifies its environment itself.
1043  */
1044 static int
1045 aio_newproc(void)
1046 {
1047 	int error;
1048 	struct proc *p;
1049 
1050 	error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, 0, "aiod%d",
1051 	    num_aio_procs);
1052 	if (error)
1053 		return (error);
1054 
1055 	/*
1056 	 * Wait until the daemon has started, but continue on anyway in
1057 	 * order to handle error conditions.
1058 	 */
1059 	error = tsleep(p, PZERO, "aiosta", aiod_timeout);
1060 
1061 	num_aio_procs++;
1062 
1063 	return (error);
1064 }
1065 
1066 /*
1067  * Try the high-performance, low-overhead physio method for eligible
1068  * VCHR devices.  This method doesn't use an aio helper thread, and
1069  * thus has very low overhead.
1070  *
1071  * Assumes that the caller, _aio_aqueue(), has incremented the file
1072  * structure's reference count, preventing its deallocation for the
1073  * duration of this call.
1074  */
1075 static int
1076 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1077 {
1078 	int error;
1079 	struct aiocb *cb;
1080 	struct file *fp;
1081 	struct buf *bp;
1082 	struct vnode *vp;
1083 	struct kaioinfo *ki;
1084 	struct aio_liojob *lj;
1085 	int s, lj_done = 0;
1086 	int notify;
1087 
1088 	cb = &aiocbe->uaiocb;
1089 	fp = aiocbe->fd_file;
1090 
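	/*
	 * A request is eligible for the physio fast path only if the
	 * descriptor refers to a disk VCHR device, the transfer is a
	 * multiple of the device block size and fits within both
	 * si_iosize_max and MAXPHYS, and the process is under its physio
	 * buffer quota.  Ineligible requests return -1 so that the caller
	 * falls back to the aio daemon path.
	 */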
1091 	if (fp->f_type != DTYPE_VNODE)
1092 		return (-1);
1093 
1094 	vp = fp->f_vnode;
1095 
1096 	/*
1097 	 * If it's not a disk, we don't want to return a positive error;
1098 	 * that would keep the aio code from falling through to the
1099 	 * threaded path when the target is a regular file.
1100 	 */
1101 	if (!vn_isdisk(vp, &error)) {
1102 		if (error == ENOTBLK)
1103 			return (-1);
1104 		else
1105 			return (error);
1106 	}
1107 
1108  	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
1109 		return (-1);
1110 
1111 	if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
1112 		return (-1);
1113 
1114 	if (cb->aio_nbytes >
1115 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1116 		return (-1);
1117 
1118 	ki = p->p_aioinfo;
1119 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1120 		return (-1);
1121 
1122 	ki->kaio_buffer_count++;
1123 
1124 	lj = aiocbe->lio;
1125 	if (lj)
1126 		lj->lioj_buffer_count++;
1127 
1128 	/* Create and build a buffer header for a transfer. */
1129 	bp = (struct buf *)getpbuf(NULL);
1130 	BUF_KERNPROC(bp);
1131 
1132 	/*
1133 	 * Get a copy of the kva from the physical buffer.
1134 	 */
1135 	error = 0;
1136 
1137 	bp->b_bcount = cb->aio_nbytes;
1138 	bp->b_bufsize = cb->aio_nbytes;
1139 	bp->b_iodone = aio_physwakeup;
1140 	bp->b_saveaddr = bp->b_data;
1141 	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1142 	bp->b_offset = cb->aio_offset;
1143 	bp->b_iooffset = cb->aio_offset;
1144 	bp->b_blkno = btodb(cb->aio_offset);
1145 	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
1146 
1147 	/*
1148 	 * Bring buffer into kernel space.
1149 	 */
1150 	if (vmapbuf(bp) < 0) {
1151 		error = EFAULT;
1152 		goto doerror;
1153 	}
1154 
1155 	s = splbio();
1156 	aiocbe->bp = bp;
1157 	bp->b_caller1 = (void *)aiocbe;
1158 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1159 	aiocbe->jobstate = JOBST_JOBQBUF;
1160 	cb->_aiocb_private.status = cb->aio_nbytes;
1161 	num_buf_aio++;
1162 	bp->b_error = 0;
1163 
1164 	splx(s);
1165 
1166 	/* Perform transfer. */
1167 	dev_strategy(vp->v_rdev, bp);
1168 
1169 	notify = 0;
1170 	s = splbio();
1171 
1172 	/*
1173 	 * If we had an error invoking the request, or an error in processing
1174 	 * the request before we have returned, we process it as an error in
1175 	 * transfer.  Note that such an I/O error is not indicated immediately,
1176 	 * but is returned using the aio_error mechanism.  In this case,
1177 	 * aio_suspend will return immediately.
1178 	 */
1179 	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1180 		struct aiocb *job = aiocbe->uuaiocb;
1181 
1182 		aiocbe->uaiocb._aiocb_private.status = 0;
1183 		suword(&job->_aiocb_private.status, 0);
1184 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1185 		suword(&job->_aiocb_private.error, bp->b_error);
1186 
1187 		if (lj) {
1188 			lj->lioj_buffer_finished_count++;
1189 			if (lj->lioj_queue_finished_count +
1190 			    lj->lioj_buffer_finished_count ==
1191 			    lj->lioj_total_count)
1192 			    lj_done = 1;
1193 		}
1194 
1195 		ki->kaio_buffer_finished_count++;
1196 
1197 		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1198 			aiocbe->jobstate = JOBST_JOBBFINISHED;
1199 			aiocbe->jobflags |= AIOCBLIST_DONE;
1200 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1201 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1202 			notify = 1;
1203 		}
1204 	}
1205 	splx(s);
1206 	if (notify) {
1207 		if (lj && !knlist_empty(&lj->klist)) {
1208 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
1209 			KNOTE_UNLOCKED(&lj->klist, 0);
1210 		}
1211 		KNOTE_UNLOCKED(&aiocbe->klist, 0);
1212 
1213 	}
1214 	if (cb->aio_lio_opcode == LIO_WRITE) {
1215 		aiocbe->outputcharge += btodb(cb->aio_nbytes);
1216 	} else if (cb->aio_lio_opcode == LIO_READ) {
1217 		aiocbe->inputcharge += btodb(cb->aio_nbytes);
1218 	}
1219 	return (0);
1220 
1221 doerror:
1222 	ki->kaio_buffer_count--;
1223 	if (lj)
1224 		lj->lioj_buffer_count--;
1225 	aiocbe->bp = NULL;
1226 	relpbuf(bp, NULL);
1227 	return (error);
1228 }
1229 
1230 /*
1231  * This waits/tests physio completion.
1232  */
1233 static int
1234 aio_fphysio(struct aiocblist *iocb)
1235 {
1236 	int s;
1237 	struct buf *bp;
1238 	int error;
1239 
1240 	bp = iocb->bp;
1241 
1242 	s = splbio();
1243 	while ((bp->b_flags & B_DONE) == 0) {
1244 		if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
1245 			if ((bp->b_flags & B_DONE) == 0) {
1246 				splx(s);
1247 				return (EINPROGRESS);
1248 			} else
1249 				break;
1250 		}
1251 	}
1252 	splx(s);
1253 
1254 	/* Release mapping into kernel space. */
1255 	vunmapbuf(bp);
1256 	iocb->bp = NULL;
1257 
1258 	error = 0;
1259 
1260 	/* Check for an error. */
1261 	if (bp->b_ioflags & BIO_ERROR)
1262 		error = bp->b_error;
1263 
1264 	relpbuf(bp, NULL);
1265 	return (error);
1266 }
1267 
1268 /*
1269  * Wake up aio requests that may be serviceable now.
1270  */
1271 static void
1272 aio_swake_cb(struct socket *so, struct sockbuf *sb)
1273 {
1274 	struct aiocblist *cb,*cbn;
1275 	struct proc *p;
1276 	struct kaioinfo *ki = NULL;
1277 	int opcode, wakecount = 0;
1278 	struct aiothreadlist *aiop;
1279 
1280 	if (sb == &so->so_snd) {
1281 		opcode = LIO_WRITE;
1282 		SOCKBUF_LOCK(&so->so_snd);
1283 		so->so_snd.sb_flags &= ~SB_AIO;
1284 		SOCKBUF_UNLOCK(&so->so_snd);
1285 	} else {
1286 		opcode = LIO_READ;
1287 		SOCKBUF_LOCK(&so->so_rcv);
1288 		so->so_rcv.sb_flags &= ~SB_AIO;
1289 		SOCKBUF_UNLOCK(&so->so_rcv);
1290 	}
1291 
1292 	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1293 		cbn = TAILQ_NEXT(cb, list);
1294 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1295 			p = cb->userproc;
1296 			ki = p->p_aioinfo;
1297 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1298 			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1299 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1300 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1301 			wakecount++;
1302 			if (cb->jobstate != JOBST_JOBQGLOBAL)
1303 				panic("invalid queue value");
1304 		}
1305 	}
1306 
1307 	while (wakecount--) {
1308 		mtx_lock(&aio_freeproc_mtx);
1309 		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1310 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1311 			aiop->aiothreadflags &= ~AIOP_FREE;
1312 			wakeup(aiop->aiothread);
1313 		}
1314 		mtx_unlock(&aio_freeproc_mtx);
1315 	}
1316 }
1317 
1318 /*
1319  * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
1320  * technique is done in this code.
1321  */
1322 static int
1323 _aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
1324 {
1325 	struct proc *p = td->td_proc;
1326 	struct filedesc *fdp;
1327 	struct file *fp;
1328 	unsigned int fd;
1329 	struct socket *so;
1330 	int s;
1331 	int error;
1332 	int opcode;
1333 	struct aiocblist *aiocbe;
1334 	struct aiothreadlist *aiop;
1335 	struct kaioinfo *ki;
1336 	struct kevent kev;
1337 	struct kqueue *kq;
1338 	struct file *kq_fp;
1339 	struct sockbuf *sb;
1340 
1341 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK);
1342 	aiocbe->inputcharge = 0;
1343 	aiocbe->outputcharge = 0;
1344 	/* XXX - need a lock */
1345 	knlist_init(&aiocbe->klist, NULL, NULL, NULL, NULL);
1346 
1347 	suword(&job->_aiocb_private.status, -1);
1348 	suword(&job->_aiocb_private.error, 0);
1349 	suword(&job->_aiocb_private.kernelinfo, -1);
1350 
1351 	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1352 	if (error) {
1353 		suword(&job->_aiocb_private.error, error);
1354 		uma_zfree(aiocb_zone, aiocbe);
1355 		return (error);
1356 	}
1357 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1358 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1359 		uma_zfree(aiocb_zone, aiocbe);
1360 		return (EINVAL);
1361 	}
1362 
1363 	/* Save userspace address of the job info. */
1364 	aiocbe->uuaiocb = job;
1365 
1366 	/* Get the opcode. */
1367 	if (type != LIO_NOP)
1368 		aiocbe->uaiocb.aio_lio_opcode = type;
1369 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1370 
1371 	/* Get the fd info for process. */
1372 	fdp = p->p_fd;
1373 
1374 	/*
1375 	 * Range check file descriptor.
1376 	 */
1377 	FILEDESC_LOCK(fdp);
1378 	fd = aiocbe->uaiocb.aio_fildes;
1379 	if (fd >= fdp->fd_nfiles) {
1380 		FILEDESC_UNLOCK(fdp);
1381 		uma_zfree(aiocb_zone, aiocbe);
1382 		if (type == 0)
1383 			suword(&job->_aiocb_private.error, EBADF);
1384 		return (EBADF);
1385 	}
1386 
1387 	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1388 	if ((fp == NULL) ||
1389 	    ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0)) ||
1390 	    ((opcode == LIO_READ) && ((fp->f_flag & FREAD) == 0))) {
1391 		FILEDESC_UNLOCK(fdp);
1392 		uma_zfree(aiocb_zone, aiocbe);
1393 		if (type == 0)
1394 			suword(&job->_aiocb_private.error, EBADF);
1395 		return (EBADF);
1396 	}
1397 	fhold(fp);
1398 	FILEDESC_UNLOCK(fdp);
1399 
1400 	if (aiocbe->uaiocb.aio_offset == -1LL) {
1401 		error = EINVAL;
1402 		goto aqueue_fail;
1403 	}
1404 	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1405 	if (error) {
1406 		error = EINVAL;
1407 		goto aqueue_fail;
1408 	}
1409 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1410 	if (jobrefid == LONG_MAX)
1411 		jobrefid = 1;
1412 	else
1413 		jobrefid++;
1414 
1415 	if (opcode == LIO_NOP) {
1416 		fdrop(fp, td);
1417 		uma_zfree(aiocb_zone, aiocbe);
1418 		if (type == 0) {
1419 			suword(&job->_aiocb_private.error, 0);
1420 			suword(&job->_aiocb_private.status, 0);
1421 			suword(&job->_aiocb_private.kernelinfo, 0);
1422 		}
1423 		return (0);
1424 	}
1425 	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1426 		if (type == 0)
1427 			suword(&job->_aiocb_private.status, 0);
1428 		error = EINVAL;
1429 		goto aqueue_fail;
1430 	}
1431 
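	/*
	 * If the caller requested SIGEV_KEVENT notification, register an
	 * EVFILT_AIO knote on the kqueue named by sigev_notify_kqueue
	 * before the request is queued.
	 */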
1432 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1433 		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1434 		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1435 	} else
1436 		goto no_kqueue;
1437 	if ((u_int)kev.ident >= fdp->fd_nfiles ||
1438 	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1439 	    (kq_fp->f_type != DTYPE_KQUEUE)) {
1440 		error = EBADF;
1441 		goto aqueue_fail;
1442 	}
1443 	kq = kq_fp->f_data;
1444 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
1445 	kev.filter = EVFILT_AIO;
1446 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1447 	kev.data = (intptr_t)aiocbe;
1448 	error = kqueue_register(kq, &kev, td, 1);
1449 aqueue_fail:
1450 	if (error) {
1451 		fdrop(fp, td);
1452 		uma_zfree(aiocb_zone, aiocbe);
1453 		if (type == 0)
1454 			suword(&job->_aiocb_private.error, error);
1455 		goto done;
1456 	}
1457 no_kqueue:
1458 
1459 	suword(&job->_aiocb_private.error, EINPROGRESS);
1460 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1461 	aiocbe->userproc = p;
1462 	aiocbe->cred = crhold(td->td_ucred);
1463 	aiocbe->jobflags = 0;
1464 	aiocbe->lio = lj;
1465 	ki = p->p_aioinfo;
1466 
1467 	if (fp->f_type == DTYPE_SOCKET) {
1468 		/*
1469 		 * Alternate queueing for socket ops: Reach down into the
1470 		 * descriptor to get the socket data.  Then check to see if the
1471 		 * socket is ready to be read or written (based on the requested
1472 		 * operation).
1473 		 *
1474 		 * If it is not ready for io, then queue the aiocbe on the
1475 		 * socket, and set the flags so we get a call when sbnotify()
1476 		 * happens.
1477 		 *
1478 		 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock
1479 		 * and unlock the snd sockbuf for no reason.
1480 		 */
1481 		so = fp->f_data;
1482 		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
1483 		SOCKBUF_LOCK(sb);
1484 		s = splnet();
1485 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1486 		    LIO_WRITE) && (!sowriteable(so)))) {
1487 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1488 			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1489 			sb->sb_flags |= SB_AIO;
1490 			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1491 			ki->kaio_queue_count++;
1492 			num_queue_count++;
1493 			SOCKBUF_UNLOCK(sb);
1494 			splx(s);
1495 			error = 0;
1496 			goto done;
1497 		}
1498 		SOCKBUF_UNLOCK(sb);
1499 		splx(s);
1500 	}
1501 
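	/*
	 * Try the physio fast path first; aio_qphysio() returns 0 on
	 * success, a positive errno on a hard failure, and -1 when the
	 * request is not eligible and must be handed to an aio daemon.
	 */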
1502 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1503 		goto done;
1504 	if (error > 0) {
1505 		suword(&job->_aiocb_private.status, 0);
1506 		aiocbe->uaiocb._aiocb_private.error = error;
1507 		suword(&job->_aiocb_private.error, error);
1508 		goto done;
1509 	}
1510 
1511 	/* No buffer for daemon I/O. */
1512 	aiocbe->bp = NULL;
1513 
1514 	ki->kaio_queue_count++;
1515 	if (lj)
1516 		lj->lioj_queue_count++;
1517 	s = splnet();
1518 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1519 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1520 	splx(s);
1521 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1522 
1523 	num_queue_count++;
1524 	error = 0;
1525 
1526 	/*
1527 	 * If we don't have a free AIO process, and we are below our quota, then
1528 	 * start one.  Otherwise, depend on the subsequent I/O completions to
1529 	 * pick up this job.  If we don't successfully create the new process
1530 	 * (thread) due to resource issues, we return an error for now (EAGAIN),
1531 	 * which is likely not the correct thing to do.
1532 	 */
1533 	mtx_lock(&aio_freeproc_mtx);
1534 retryproc:
1535 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1536 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1537 		aiop->aiothreadflags &= ~AIOP_FREE;
1538 		wakeup(aiop->aiothread);
1539 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1540 	    ((ki->kaio_active_count + num_aio_resv_start) <
1541 	    ki->kaio_maxactive_count)) {
1542 		num_aio_resv_start++;
1543 		mtx_unlock(&aio_freeproc_mtx);
1544 		if ((error = aio_newproc()) == 0) {
1545 			mtx_lock(&aio_freeproc_mtx);
1546 			num_aio_resv_start--;
1547 			goto retryproc;
1548 		}
1549 		mtx_lock(&aio_freeproc_mtx);
1550 		num_aio_resv_start--;
1551 	}
1552 	mtx_unlock(&aio_freeproc_mtx);
1553 done:
1554 	return (error);
1555 }
1556 
1557 /*
1558  * This routine queues an AIO request, checking for quotas.
1559  */
1560 static int
1561 aio_aqueue(struct thread *td, struct aiocb *job, int type)
1562 {
1563 	struct proc *p = td->td_proc;
1564 	struct kaioinfo *ki;
1565 
1566 	if (p->p_aioinfo == NULL)
1567 		aio_init_aioinfo(p);
1568 
1569 	if (num_queue_count >= max_queue_count)
1570 		return (EAGAIN);
1571 
1572 	ki = p->p_aioinfo;
1573 	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1574 		return (EAGAIN);
1575 
1576 	return _aio_aqueue(td, job, NULL, type);
1577 }
1578 
1579 /*
1580  * Support the aio_return system call; as a side effect, kernel resources are
1581  * released.
1582  */
1583 int
1584 aio_return(struct thread *td, struct aio_return_args *uap)
1585 {
1586 	struct proc *p = td->td_proc;
1587 	int s;
1588 	long jobref;
1589 	struct aiocblist *cb, *ncb;
1590 	struct aiocb *ujob;
1591 	struct kaioinfo *ki;
1592 
1593 	ujob = uap->aiocbp;
1594 	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1595 	if (jobref == -1 || jobref == 0)
1596 		return (EINVAL);
1597 
1598 	ki = p->p_aioinfo;
1599 	if (ki == NULL)
1600 		return (EINVAL);
1601 	PROC_LOCK(p);
1602 	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1603 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1604 		    jobref)
1605 			goto done;
1606 	}
1607 
1608 	s = splbio();
1609 	/* aio_physwakeup */
1610 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1611 		ncb = TAILQ_NEXT(cb, plist);
1612 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1613 		    == jobref) {
1614 			break;
1615 		}
1616 	}
1617 	splx(s);
1618  done:
1619 	PROC_UNLOCK(p);
1620 	if (cb != NULL) {
1621 		if (ujob == cb->uuaiocb) {
1622 			td->td_retval[0] =
1623 			    cb->uaiocb._aiocb_private.status;
1624 		} else
1625 			td->td_retval[0] = EFAULT;
1626 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1627 			p->p_stats->p_ru.ru_oublock +=
1628 			    cb->outputcharge;
1629 			cb->outputcharge = 0;
1630 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1631 			p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1632 			cb->inputcharge = 0;
1633 		}
1634 		aio_free_entry(cb);
1635 		return (0);
1636 	}
1637 	return (EINVAL);
1638 }
1639 
1640 /*
1641  * Allow a process to wake up when any of the given I/O requests has completed.
1642  */
1643 int
1644 aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1645 {
1646 	struct proc *p = td->td_proc;
1647 	struct timeval atv;
1648 	struct timespec ts;
1649 	struct aiocb *const *cbptr, *cbp;
1650 	struct kaioinfo *ki;
1651 	struct aiocblist *cb;
1652 	int i;
1653 	int njoblist;
1654 	int error, s, timo;
1655 	long *ijoblist;
1656 	struct aiocb **ujoblist;
1657 
1658 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
1659 		return (EINVAL);
1660 
1661 	timo = 0;
1662 	if (uap->timeout) {
1663 		/* Get timespec struct. */
1664 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1665 			return (error);
1666 
1667 		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1668 			return (EINVAL);
1669 
1670 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
1671 		if (itimerfix(&atv))
1672 			return (EINVAL);
1673 		timo = tvtohz(&atv);
1674 	}
1675 
1676 	ki = p->p_aioinfo;
1677 	if (ki == NULL)
1678 		return (EAGAIN);
1679 
1680 	njoblist = 0;
1681 	ijoblist = uma_zalloc(aiol_zone, M_WAITOK);
1682 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
1683 	cbptr = uap->aiocbp;
1684 
1685 	for (i = 0; i < uap->nent; i++) {
1686 		cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
1687 		if (cbp == 0)
1688 			continue;
1689 		ujoblist[njoblist] = cbp;
1690 		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1691 		njoblist++;
1692 	}
1693 
1694 	if (njoblist == 0) {
1695 		uma_zfree(aiol_zone, ijoblist);
1696 		uma_zfree(aiol_zone, ujoblist);
1697 		return (0);
1698 	}
1699 
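	/*
	 * Scan the per-process done queues for any of the named jobs; if
	 * none has completed yet, sleep (with the optional timeout) and
	 * rescan.
	 */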
1700 	error = 0;
1701 	for (;;) {
1702 		PROC_LOCK(p);
1703 		TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1704 			for (i = 0; i < njoblist; i++) {
1705 				if (((intptr_t)
1706 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1707 				    ijoblist[i]) {
1708 					PROC_UNLOCK(p);
1709 					if (ujoblist[i] != cb->uuaiocb)
1710 						error = EINVAL;
1711 					uma_zfree(aiol_zone, ijoblist);
1712 					uma_zfree(aiol_zone, ujoblist);
1713 					return (error);
1714 				}
1715 			}
1716 		}
1717 
1718 		s = splbio();
1719 		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1720 		    TAILQ_NEXT(cb, plist)) {
1721 			for (i = 0; i < njoblist; i++) {
1722 				if (((intptr_t)
1723 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1724 				    ijoblist[i]) {
1725 					PROC_UNLOCK(p);
1726 					splx(s);
1727 					if (ujoblist[i] != cb->uuaiocb)
1728 						error = EINVAL;
1729 					uma_zfree(aiol_zone, ijoblist);
1730 					uma_zfree(aiol_zone, ujoblist);
1731 					return (error);
1732 				}
1733 			}
1734 		}
1735 
1736 		ki->kaio_flags |= KAIO_WAKEUP;
1737 		error = msleep(p, &p->p_mtx, PDROP | PRIBIO | PCATCH, "aiospn",
1738 		    timo);
1739 		splx(s);
1740 
1741 		if (error == ERESTART || error == EINTR) {
1742 			uma_zfree(aiol_zone, ijoblist);
1743 			uma_zfree(aiol_zone, ujoblist);
1744 			return (EINTR);
1745 		} else if (error == EWOULDBLOCK) {
1746 			uma_zfree(aiol_zone, ijoblist);
1747 			uma_zfree(aiol_zone, ujoblist);
1748 			return (EAGAIN);
1749 		}
1750 	}
1751 
1752 /* NOTREACHED */
1753 	return (EINVAL);
1754 }
1755 
1756 /*
1757  * aio_cancel cancels any non-physio aio operations not currently in
1758  * progress.
1759  */
1760 int
1761 aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1762 {
1763 	struct proc *p = td->td_proc;
1764 	struct kaioinfo *ki;
1765 	struct aiocblist *cbe, *cbn;
1766 	struct file *fp;
1767 	struct filedesc *fdp;
1768 	struct socket *so;
1769 	struct proc *po;
1770 	int s, error;
1771 	int cancelled = 0;
1772 	int notcancelled = 0;
1773 	struct vnode *vp;
1774 
1775 	fdp = p->p_fd;
1776 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
1777 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1778 		return (EBADF);
1779 
1780 	if (fp->f_type == DTYPE_VNODE) {
1781 		vp = fp->f_vnode;
1782 
1783 		if (vn_isdisk(vp,&error)) {
1784 			td->td_retval[0] = AIO_NOTCANCELED;
1785 			return (0);
1786 		}
1787 	} else if (fp->f_type == DTYPE_SOCKET) {
1788 		so = fp->f_data;
1789 
1790 		s = splnet();
1791 
1792 		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1793 			cbn = TAILQ_NEXT(cbe, list);
1794 			if ((uap->aiocbp == NULL) ||
1795 			    (uap->aiocbp == cbe->uuaiocb)) {
1796 				po = cbe->userproc;
1797 				ki = po->p_aioinfo;
1798 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1799 				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1800 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1801 				if (ki->kaio_flags & KAIO_WAKEUP) {
1802 					wakeup(po);
1803 				}
1804 				cbe->jobstate = JOBST_JOBFINISHED;
1805 				cbe->uaiocb._aiocb_private.status = -1;
1806 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1807 				cancelled++;
1808 /* XXX cancelled, knote? */
1809 				if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1810 				    SIGEV_SIGNAL) {
1811 					PROC_LOCK(cbe->userproc);
1812 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1813 					PROC_UNLOCK(cbe->userproc);
1814 				}
1815 				if (uap->aiocbp)
1816 					break;
1817 			}
1818 		}
1819 		splx(s);
1820 
1821 		if ((cancelled) && (uap->aiocbp)) {
1822 			td->td_retval[0] = AIO_CANCELED;
1823 			return (0);
1824 		}
1825 	}
1826 	ki = p->p_aioinfo;
1827 	if (ki == NULL)
1828 		goto done;
1829 	s = splnet();
1830 
1831 	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1832 		cbn = TAILQ_NEXT(cbe, plist);
1833 
1834 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1835 		    ((uap->aiocbp == NULL ) ||
1836 		     (uap->aiocbp == cbe->uuaiocb))) {
1837 
1838 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1839 				TAILQ_REMOVE(&aio_jobs, cbe, list);
1840 				cbe->jobstate = JOBST_JOBFINISHED;
1841 				cancelled++;
1842 				cbe->uaiocb._aiocb_private.status = -1;
1843 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1844 				aio_bio_done_notify(cbe->userproc, cbe, DONE_QUEUE);
1845 			} else {
1846 				notcancelled++;
1847 			}
1848 		}
1849 	}
1850 	splx(s);
1851 done:
1852 	if (notcancelled) {
1853 		td->td_retval[0] = AIO_NOTCANCELED;
1854 		return (0);
1855 	}
1856 	if (cancelled) {
1857 		td->td_retval[0] = AIO_CANCELED;
1858 		return (0);
1859 	}
1860 	td->td_retval[0] = AIO_ALLDONE;
1861 
1862 	return (0);
1863 }
1864 
1865 /*
1866  * aio_error is implemented at the kernel level for compatibility purposes
1867  * only.  For a user-mode async implementation, it would be best to do it
1868  * in a userland subroutine.
1869  */
1870 int
1871 aio_error(struct thread *td, struct aio_error_args *uap)
1872 {
1873 	struct proc *p = td->td_proc;
1874 	int s;
1875 	struct aiocblist *cb;
1876 	struct kaioinfo *ki;
1877 	long jobref;
1878 
1879 	ki = p->p_aioinfo;
1880 	if (ki == NULL)
1881 		return (EINVAL);
1882 
1883 	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1884 	if ((jobref == -1) || (jobref == 0))
1885 		return (EINVAL);
1886 
1887 	PROC_LOCK(p);
1888 	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1889 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1890 		    jobref) {
1891 			PROC_UNLOCK(p);
1892 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1893 			return (0);
1894 		}
1895 	}
1896 
1897 	s = splnet();
1898 
1899 	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1900 	    plist)) {
1901 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1902 		    jobref) {
1903 			PROC_UNLOCK(p);
1904 			td->td_retval[0] = EINPROGRESS;
1905 			splx(s);
1906 			return (0);
1907 		}
1908 	}
1909 
1910 	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1911 	    plist)) {
1912 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1913 		    jobref) {
1914 			PROC_UNLOCK(p);
1915 			td->td_retval[0] = EINPROGRESS;
1916 			splx(s);
1917 			return (0);
1918 		}
1919 	}
1920 	splx(s);
1921 
1922 	s = splbio();
1923 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1924 	    plist)) {
1925 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1926 		    jobref) {
1927 			PROC_UNLOCK(p);
1928 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1929 			splx(s);
1930 			return (0);
1931 		}
1932 	}
1933 
1934 	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1935 	    plist)) {
1936 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1937 		    jobref) {
1938 			PROC_UNLOCK(p);
1939 			td->td_retval[0] = EINPROGRESS;
1940 			splx(s);
1941 			return (0);
1942 		}
1943 	}
1944 	splx(s);
1945 	PROC_UNLOCK(p);
1946 
1947 #if (0)
1948 	/*
1949 	 * Hack for lio.
1950 	 */
1951 	status = fuword(&uap->aiocbp->_aiocb_private.status);
1952 	if (status == -1)
1953 		return fuword(&uap->aiocbp->_aiocb_private.error);
1954 #endif
1955 	return (EINVAL);
1956 }
1957 
1958 /* syscall - asynchronous read from a file (REALTIME) */
1959 int
1960 aio_read(struct thread *td, struct aio_read_args *uap)
1961 {
1962 
1963 	return aio_aqueue(td, uap->aiocbp, LIO_READ);
1964 }
1965 
1966 /* syscall - asynchronous write to a file (REALTIME) */
1967 int
1968 aio_write(struct thread *td, struct aio_write_args *uap)
1969 {
1970 
1971 	return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
1972 }
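
/*
 * Usage sketch (userland, not part of this file): the minimal bookkeeping a
 * caller of aio_read(2) has to do.  The aiocb and the buffer must stay
 * valid until the request completes; the helper name and arguments are
 * placeholders.  aio_write(2) is submitted the same way.
 *
 *	#include <sys/types.h>
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	static int
 *	submit_read(int fd, void *buf, size_t len, off_t off, struct aiocb *iocb)
 *	{
 *		memset(iocb, 0, sizeof(*iocb));
 *		iocb->aio_fildes = fd;
 *		iocb->aio_buf = buf;
 *		iocb->aio_nbytes = len;
 *		iocb->aio_offset = off;
 *		iocb->aio_sigevent.sigev_notify = SIGEV_NONE;
 *		return (aio_read(iocb));	// 0 if queued, -1 with errno set
 *	}
 */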
1973 
1974 /* syscall - list directed I/O (REALTIME) */
1975 int
1976 lio_listio(struct thread *td, struct lio_listio_args *uap)
1977 {
1978 	struct proc *p = td->td_proc;
1979 	int nent, nentqueued;
1980 	struct aiocb *iocb, * const *cbptr;
1981 	struct aiocblist *cb;
1982 	struct kaioinfo *ki;
1983 	struct aio_liojob *lj;
1984 	struct kevent kev;
1985 	struct kqueue * kq;
1986 	struct file *kq_fp;
1987 	int error, runningcode;
1988 	int nerror;
1989 	int i;
1990 
1991 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1992 		return (EINVAL);
1993 
1994 	nent = uap->nent;
1995 	if (nent < 0 || nent > AIO_LISTIO_MAX)
1996 		return (EINVAL);
1997 
1998 	if (p->p_aioinfo == NULL)
1999 		aio_init_aioinfo(p);
2000 
2001 	if ((nent + num_queue_count) > max_queue_count)
2002 		return (EAGAIN);
2003 
2004 	ki = p->p_aioinfo;
2005 	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
2006 		return (EAGAIN);
2007 
2008 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
2009 	if (!lj)
2010 		return (EAGAIN);
2011 
2012 	lj->lioj_flags = 0;
2013 	lj->lioj_buffer_count = 0;
2014 	lj->lioj_buffer_finished_count = 0;
2015 	lj->lioj_queue_count = 0;
2016 	lj->lioj_queue_finished_count = 0;
2017 	lj->lioj_total_count = nent;
2018 	knlist_init(&lj->klist, NULL, NULL, NULL, NULL);
2019 
2020 	kev.ident = 0;
2021 
2022 	/*
2023 	 * Setup signal.
2024 	 */
2025 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2026 		error = copyin(uap->sig, &lj->lioj_signal,
2027 		    sizeof(lj->lioj_signal));
2028 		if (error) {
2029 			uma_zfree(aiolio_zone, lj);
2030 			return (error);
2031 		}
2032 
2033 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2034 			/* Assume only new style KEVENT */
2035 			kev.ident = lj->lioj_signal.sigev_notify_kqueue;
2036 			kev.udata = lj->lioj_signal.sigev_value.sigval_ptr;
2037 
2038 			if ((u_int)kev.ident >= p->p_fd->fd_nfiles ||
2039 			    (kq_fp = p->p_fd->fd_ofiles[kev.ident]) == NULL ||
2040 			    (kq_fp->f_type != DTYPE_KQUEUE)) {
2041 				uma_zfree(aiolio_zone, lj);
2042 				return (EBADF);
2043 			}
2044 			kq = (struct kqueue *)kq_fp->f_data;
2045 			kev.filter = EVFILT_LIO;
2046 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
2047 			kev.ident = (uintptr_t)lj; /* something unique */
2048 			kev.data = (intptr_t)lj;
2049 			error = kqueue_register(kq, &kev, td, 1);
2050 			if (error) {
2051 				uma_zfree(aiolio_zone, lj);
2052 				return (error);
2053 			}
2054 		} else if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2055 			uma_zfree(aiolio_zone, lj);
2056 			return (EINVAL);
2057 		} else {
2058 			lj->lioj_flags |= LIOJ_SIGNAL;
2059 			lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
2060 		}
2061 	} else
2062 		lj->lioj_flags &= ~LIOJ_SIGNAL;
2063 
2064 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2065 	/*
2066 	 * Get pointers to the list of I/O requests.
2067 	 */
2068 	nerror = 0;
2069 	nentqueued = 0;
2070 	cbptr = uap->acb_list;
2071 	for (i = 0; i < uap->nent; i++) {
2072 		iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
2073 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
2074 			error = _aio_aqueue(td, iocb, lj, 0);
2075 			if (error == 0)
2076 				nentqueued++;
2077 			else
2078 				nerror++;
2079 		}
2080 	}
2081 
2082 	/*
2083 	 * If we didn't queue any of the requests, just return.
2084 	 */
2085 	if (nentqueued == 0)
2086 		return (0);
2087 
2088 	/*
2089 	 * Calculate the appropriate error return.
2090 	 */
2091 	runningcode = 0;
2092 	if (nerror)
2093 		runningcode = EIO;
2094 
2095 	if (uap->mode == LIO_WAIT) {
2096 		int command, found;
2097 		long jobref;
2098 
2099 		for (;;) {
2100 			found = 0;
2101 			for (i = 0; i < uap->nent; i++) {
2102 				/*
2103 				 * Fetch address of the control buf pointer in
2104 				 * user space.
2105 				 */
2106 				iocb = (struct aiocb *)
2107 				    (intptr_t)fuword(&cbptr[i]);
2108 				if (((intptr_t)iocb == -1) ||
2109 				    ((intptr_t)iocb == 0))
2110 					continue;
2111 
2112 				/*
2113 				 * Fetch the associated command from user space.
2114 				 */
2115 				command = fuword(&iocb->aio_lio_opcode);
2116 				if (command == LIO_NOP) {
2117 					found++;
2118 					continue;
2119 				}
2120 
2121 				jobref =
2122 				    fuword(&iocb->_aiocb_private.kernelinfo);
2123 
2124 				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2125 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2126 					    == jobref) {
2127 						if (cb->uaiocb.aio_lio_opcode
2128 						    == LIO_WRITE) {
2129 							p->p_stats->p_ru.ru_oublock
2130 							    +=
2131 							    cb->outputcharge;
2132 							cb->outputcharge = 0;
2133 						} else if (cb->uaiocb.aio_lio_opcode
2134 						    == LIO_READ) {
2135 							p->p_stats->p_ru.ru_inblock
2136 							    += cb->inputcharge;
2137 							cb->inputcharge = 0;
2138 						}
2139 						found++;
2140 						break;
2141 					}
2142 				}
2143 
2144 				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2145 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2146 					    == jobref) {
2147 						found++;
2148 						break;
2149 					}
2150 				}
2151 			}
2152 
2153 			/*
2154 			 * If all I/Os have been disposed of, then we can
2155 			 * return.
2156 			 */
2157 			if (found == nentqueued)
2158 				return (runningcode);
2159 
2160 			ki->kaio_flags |= KAIO_WAKEUP;
2161 			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2162 
2163 			if (error == EINTR)
2164 				return (EINTR);
2165 			else if (error == EWOULDBLOCK)
2166 				return (EAGAIN);
2167 		}
2168 	}
2169 
2170 	return (runningcode);
2171 }
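
/*
 * Usage sketch (userland, not part of this file): submit a small batch of
 * reads with lio_listio(2) and block until all of them are done.  The
 * descriptor, buffers, and lengths are placeholders.
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	static int
 *	read_batch(int fd, char *buf0, char *buf1, size_t len)
 *	{
 *		struct aiocb cb[2];
 *		struct aiocb *list[2] = { &cb[0], &cb[1] };
 *		int i;
 *
 *		memset(cb, 0, sizeof(cb));
 *		for (i = 0; i < 2; i++) {
 *			cb[i].aio_fildes = fd;
 *			cb[i].aio_nbytes = len;
 *			cb[i].aio_lio_opcode = LIO_READ;
 *		}
 *		cb[0].aio_buf = buf0;
 *		cb[0].aio_offset = 0;
 *		cb[1].aio_buf = buf1;
 *		cb[1].aio_offset = len;
 *
 *		// LIO_WAIT: do not return until every request has finished.
 *		return (lio_listio(LIO_WAIT, list, 2, NULL));
 *	}
 */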
2172 
2173 /*
2174  * Interrupt handler for physio; performs the necessary process wakeups and
2175  * signals.
2176  */
2177 static void
2178 aio_physwakeup(struct buf *bp)
2179 {
2180 	struct aiocblist *aiocbe;
2181 	struct proc *userp;
2182 
2183 	mtx_lock(&Giant);
2184 	bp->b_flags |= B_DONE;
2185 	wakeup(bp);
2186 
2187 	aiocbe = (struct aiocblist *)bp->b_caller1;
2188 	if (aiocbe) {
2189 		userp = aiocbe->userproc;
2190 
2191 		aiocbe->jobstate = JOBST_JOBBFINISHED;
2192 		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2193 		aiocbe->uaiocb._aiocb_private.error = 0;
2194 		aiocbe->jobflags |= AIOCBLIST_DONE;
2195 
2196 		if (bp->b_ioflags & BIO_ERROR)
2197 			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2198 
2199 		aio_bio_done_notify(userp, aiocbe, DONE_BUF);
2200 	}
2201 	mtx_unlock(&Giant);
2202 }
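
/*
 * Usage sketch (userland, not part of this file): requesting the completion
 * signal that the notification path above ends up delivering.  SIGUSR1 and
 * the helper names are arbitrary; which request finished still has to be
 * discovered with aio_error(2)/aio_return(2).
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	static volatile sig_atomic_t aio_completions;
 *
 *	static void
 *	aio_done(int sig)
 *	{
 *		aio_completions++;
 *	}
 *
 *	static int
 *	submit_with_signal(int fd, void *buf, size_t len, struct aiocb *iocb)
 *	{
 *		struct sigaction sa;
 *
 *		memset(&sa, 0, sizeof(sa));
 *		sa.sa_handler = aio_done;
 *		if (sigaction(SIGUSR1, &sa, NULL) == -1)
 *			return (-1);
 *
 *		memset(iocb, 0, sizeof(*iocb));
 *		iocb->aio_fildes = fd;
 *		iocb->aio_buf = buf;
 *		iocb->aio_nbytes = len;
 *		iocb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *		iocb->aio_sigevent.sigev_signo = SIGUSR1;
 *		return (aio_read(iocb));
 *	}
 */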
2203 
2204 /* syscall - wait for the next completion of an aio request */
2205 int
2206 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2207 {
2208 	struct proc *p = td->td_proc;
2209 	struct timeval atv;
2210 	struct timespec ts;
2211 	struct kaioinfo *ki;
2212 	struct aiocblist *cb = NULL;
2213 	int error, s, timo;
2214 
2215 	suword(uap->aiocbp, (long)NULL);
2216 
2217 	timo = 0;
2218 	if (uap->timeout) {
2219 		/* Get timespec struct. */
2220 		error = copyin(uap->timeout, &ts, sizeof(ts));
2221 		if (error)
2222 			return (error);
2223 
2224 		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2225 			return (EINVAL);
2226 
2227 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
2228 		if (itimerfix(&atv))
2229 			return (EINVAL);
2230 		timo = tvtohz(&atv);
2231 	}
2232 
2233 	ki = p->p_aioinfo;
2234 	if (ki == NULL)
2235 		return (EAGAIN);
2236 
2237 	for (;;) {
2238 		PROC_LOCK(p);
2239 		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != NULL) {
2240 			PROC_UNLOCK(p);
2241 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2242 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2243 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2244 				p->p_stats->p_ru.ru_oublock +=
2245 				    cb->outputcharge;
2246 				cb->outputcharge = 0;
2247 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2248 				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2249 				cb->inputcharge = 0;
2250 			}
2251 			error = cb->uaiocb._aiocb_private.error;
2252 			aio_free_entry(cb);
2253 			return (error);
2254 		}
2255 
2256 		s = splbio();
2257 		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != NULL) {
2258 			PROC_UNLOCK(p);
2259 			splx(s);
2260 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2261 			error = cb->uaiocb._aiocb_private.error;
2262 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2263 			aio_free_entry(cb);
2264 			return (error);
2265 		}
2266 
2267 		ki->kaio_flags |= KAIO_WAKEUP;
2268 		error = msleep(p, &p->p_mtx, PDROP | PRIBIO | PCATCH, "aiowc",
2269 		    timo);
2270 		splx(s);
2271 
2272 		if (error == ERESTART)
2273 			return (EINTR);
2274 		else if (error < 0)
2275 			return (error);
2276 		else if (error == EINTR)
2277 			return (EINTR);
2278 		else if (error == EWOULDBLOCK)
2279 			return (EAGAIN);
2280 	}
2281 }
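
/*
 * Usage sketch (userland, not part of this file): drain completions with
 * the FreeBSD-specific aio_waitcomplete(2) instead of polling each aiocb;
 * the one second timeout and the helper name are arbitrary.
 *
 *	#include <aio.h>
 *	#include <time.h>
 *
 *	static ssize_t
 *	next_completion(struct aiocb **iocbp)
 *	{
 *		struct timespec ts;
 *
 *		ts.tv_sec = 1;
 *		ts.tv_nsec = 0;
 *		// On success, the completed request's aiocb pointer is stored
 *		// in *iocbp and the return value is its aio_return(2) result.
 *		return (aio_waitcomplete(iocbp, &ts));
 *	}
 */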
2282 
2283 /* kqueue attach function */
2284 static int
2285 filt_aioattach(struct knote *kn)
2286 {
2287 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2288 
2289 	/*
2290 	 * The aiocbe pointer must be validated before using it, so
2291 	 * registration is restricted to the kernel; the user cannot
2292 	 * set EV_FLAG1.
2293 	 */
2294 	if ((kn->kn_flags & EV_FLAG1) == 0)
2295 		return (EPERM);
2296 	kn->kn_flags &= ~EV_FLAG1;
2297 
2298 	knlist_add(&aiocbe->klist, kn, 0);
2299 
2300 	return (0);
2301 }
2302 
2303 /* kqueue detach function */
2304 static void
2305 filt_aiodetach(struct knote *kn)
2306 {
2307 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2308 
2309 	if (!knlist_empty(&aiocbe->klist))
2310 		knlist_remove(&aiocbe->klist, kn, 0);
2311 }
2312 
2313 /* kqueue filter function */
2314 /*ARGSUSED*/
2315 static int
2316 filt_aio(struct knote *kn, long hint)
2317 {
2318 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2319 
2320 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2321 	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2322 	    aiocbe->jobstate != JOBST_JOBBFINISHED)
2323 		return (0);
2324 	kn->kn_flags |= EV_EOF;
2325 	return (1);
2326 }
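
/*
 * Usage sketch (userland, not part of this file): completion notification
 * through a kqueue, which is what the EVFILT_AIO filter above serves.  The
 * kqueue descriptor "kq" is assumed to come from an earlier kqueue(2) call;
 * the helper names are placeholders.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	static int
 *	submit_kevent_read(int kq, int fd, void *buf, size_t len,
 *	    struct aiocb *iocb)
 *	{
 *		memset(iocb, 0, sizeof(*iocb));
 *		iocb->aio_fildes = fd;
 *		iocb->aio_buf = buf;
 *		iocb->aio_nbytes = len;
 *		iocb->aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *		iocb->aio_sigevent.sigev_notify_kqueue = kq;
 *		return (aio_read(iocb));
 *	}
 *
 *	static struct aiocb *
 *	reap_one(int kq)
 *	{
 *		struct kevent ev;
 *
 *		if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)
 *			return (NULL);
 *		// For EVFILT_AIO the event identifier is the aiocb pointer.
 *		return ((struct aiocb *)ev.ident);
 *	}
 */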
2327 
2328 /* kqueue attach function */
2329 static int
2330 filt_lioattach(struct knote *kn)
2331 {
2332 	struct aio_liojob *lj = (struct aio_liojob *)kn->kn_sdata;
2333 
2334 	/*
2335 	 * The aio_liojob pointer must be validated before using it, so
2336 	 * registration is restricted to the kernel; the user cannot
2337 	 * set EV_FLAG1.
2338 	 */
2339 	if ((kn->kn_flags & EV_FLAG1) == 0)
2340 		return (EPERM);
2341 	kn->kn_flags &= ~EV_FLAG1;
2342 
2343 	knlist_add(&lj->klist, kn, 0);
2344 
2345 	return (0);
2346 }
2347 
2348 /* kqueue detach function */
2349 static void
2350 filt_liodetach(struct knote *kn)
2351 {
2352 	struct aio_liojob *lj = (struct aio_liojob *)kn->kn_sdata;
2353 
2354 	if (!knlist_empty(&lj->klist))
2355 		knlist_remove(&lj->klist, kn, 0);
2356 }
2357 
2358 /* kqueue filter function */
2359 /*ARGSUSED*/
2360 static int
2361 filt_lio(struct knote *kn, long hint)
2362 {
2363 	struct aio_liojob *lj = (struct aio_liojob *)kn->kn_sdata;
2364 	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
2365 }
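
/*
 * Usage sketch (userland, not part of this file): asking for a single
 * EVFILT_LIO event when an entire lio_listio(2) batch finishes, instead of
 * one EVFILT_AIO event per request.  The list is set up as in the earlier
 * lio_listio() sketch; the completion event is then collected with
 * kevent(2) just like an EVFILT_AIO event.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	static int
 *	submit_batch_kevent(int kq, struct aiocb *const list[], int nent)
 *	{
 *		struct sigevent sig;
 *
 *		memset(&sig, 0, sizeof(sig));
 *		sig.sigev_notify = SIGEV_KEVENT;
 *		sig.sigev_notify_kqueue = kq;
 *		return (lio_listio(LIO_NOWAIT, list, nent, &sig));
 *	}
 */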
2366