xref: /freebsd/sys/kern/vfs_aio.c (revision 23090366f729c56cab62de74c7a51792357e98a9)
1 /*-
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  */
16 
17 /*
18  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
19  */
20 
21 #include <sys/cdefs.h>
22 __FBSDID("$FreeBSD$");
23 
24 #include "opt_compat.h"
25 
26 #include <sys/param.h>
27 #include <sys/systm.h>
28 #include <sys/malloc.h>
29 #include <sys/bio.h>
30 #include <sys/buf.h>
31 #include <sys/capability.h>
32 #include <sys/eventhandler.h>
33 #include <sys/sysproto.h>
34 #include <sys/filedesc.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/kthread.h>
38 #include <sys/fcntl.h>
39 #include <sys/file.h>
40 #include <sys/limits.h>
41 #include <sys/lock.h>
42 #include <sys/mutex.h>
43 #include <sys/unistd.h>
44 #include <sys/posix4.h>
45 #include <sys/proc.h>
46 #include <sys/resourcevar.h>
47 #include <sys/signalvar.h>
48 #include <sys/protosw.h>
49 #include <sys/sema.h>
50 #include <sys/socket.h>
51 #include <sys/socketvar.h>
52 #include <sys/syscall.h>
53 #include <sys/sysent.h>
54 #include <sys/sysctl.h>
55 #include <sys/sx.h>
56 #include <sys/taskqueue.h>
57 #include <sys/vnode.h>
58 #include <sys/conf.h>
59 #include <sys/event.h>
60 #include <sys/mount.h>
61 
62 #include <machine/atomic.h>
63 
64 #include <vm/vm.h>
65 #include <vm/vm_extern.h>
66 #include <vm/pmap.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/uma.h>
70 #include <sys/aio.h>
71 
72 #include "opt_vfs_aio.h"
73 
74 /*
75  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
76  * overflow. (XXX will be removed soon.)
77  */
78 static u_long jobrefid;
79 
80 /*
81  * Counter for aio_fsync.
82  */
83 static uint64_t jobseqno;
84 
85 #define JOBST_NULL		0
86 #define JOBST_JOBQSOCK		1
87 #define JOBST_JOBQGLOBAL	2
88 #define JOBST_JOBRUNNING	3
89 #define JOBST_JOBFINISHED	4
90 #define JOBST_JOBQBUF		5
91 #define JOBST_JOBQSYNC		6
92 
93 #ifndef MAX_AIO_PER_PROC
94 #define MAX_AIO_PER_PROC	32
95 #endif
96 
97 #ifndef MAX_AIO_QUEUE_PER_PROC
98 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
99 #endif
100 
101 #ifndef MAX_AIO_PROCS
102 #define MAX_AIO_PROCS		32
103 #endif
104 
105 #ifndef MAX_AIO_QUEUE
106 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
107 #endif
108 
109 #ifndef TARGET_AIO_PROCS
110 #define TARGET_AIO_PROCS	4
111 #endif
112 
113 #ifndef MAX_BUF_AIO
114 #define MAX_BUF_AIO		16
115 #endif
116 
117 #ifndef AIOD_TIMEOUT_DEFAULT
118 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
119 #endif
120 
121 #ifndef AIOD_LIFETIME_DEFAULT
122 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
123 #endif
124 
125 FEATURE(aio, "Asynchronous I/O");
126 
127 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
128 
129 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
130 
131 static int max_aio_procs = MAX_AIO_PROCS;
132 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
133 	CTLFLAG_RW, &max_aio_procs, 0,
134 	"Maximum number of kernel threads to use for handling async IO");
135 
136 static int num_aio_procs = 0;
137 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
138 	CTLFLAG_RD, &num_aio_procs, 0,
139 	"Number of presently active kernel threads for async IO");
140 
141 /*
142  * The code will adjust the actual number of AIO processes towards this
143  * number when it gets a chance.
144  */
145 static int target_aio_procs = TARGET_AIO_PROCS;
146 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
147 	0, "Preferred number of ready kernel threads for async IO");
148 
149 static int max_queue_count = MAX_AIO_QUEUE;
150 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
151     "Maximum number of aio requests to queue, globally");
152 
153 static int num_queue_count = 0;
154 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
155     "Number of queued aio requests");
156 
157 static int num_buf_aio = 0;
158 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
159     "Number of aio requests presently handled by the buf subsystem");
160 
161 /* Number of async I/O threads in the process of being started */
162 /* XXX This should be local to aio_aqueue() */
163 static int num_aio_resv_start = 0;
164 
165 static int aiod_timeout;
166 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
167     "Timeout value for synchronous aio operations");
168 
169 static int aiod_lifetime;
170 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
171     "Maximum lifetime for idle aiod");
172 
173 static int unloadable = 0;
174 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
175     "Allow unload of aio (not recommended)");
176 
177 
178 static int max_aio_per_proc = MAX_AIO_PER_PROC;
179 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
180     0, "Maximum active aio requests per process (stored in the process)");
181 
182 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
183 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
184     &max_aio_queue_per_proc, 0,
185     "Maximum queued aio requests per process (stored in the process)");
186 
187 static int max_buf_aio = MAX_BUF_AIO;
188 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
189     "Maximum buf aio requests per process (stored in the process)");
190 
191 typedef struct oaiocb {
192 	int	aio_fildes;		/* File descriptor */
193 	off_t	aio_offset;		/* File offset for I/O */
194 	volatile void *aio_buf;         /* I/O buffer in process space */
195 	size_t	aio_nbytes;		/* Number of bytes for I/O */
196 	struct	osigevent aio_sigevent;	/* Signal to deliver */
197 	int	aio_lio_opcode;		/* LIO opcode */
198 	int	aio_reqprio;		/* Request priority -- ignored */
199 	struct	__aiocb_private	_aiocb_private;
200 } oaiocb_t;
201 
202 /*
203  * Below is a key of the locks used to protect each member of struct
204  * aiocblist, aioliojob, and kaioinfo, and of any backend.
205  *
206  * * - need not be protected
207  * a - locked by the kaioinfo lock
208  * b - locked by the backend lock; the backend lock can be null in some
209  *     cases (for example, for the BIO backend), in which case the proc
210  *     lock is reused.
211  * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
212  */
213 
214 /*
215  * Currently, there are only two backends: BIO and generic file I/O.
216  * Socket I/O is served by the generic file I/O backend.  This is not a
217  * good idea, since disk file I/O and any other type without the O_NONBLOCK
218  * flag can block the daemon threads; if no thread is free to serve socket
219  * I/O, that I/O is delayed too long or starved.  We should create threads
220  * dedicated to sockets for non-blocking I/O, and the same for pipes and
221  * fifos, since those really need a non-blocking interface.  Fiddling with
222  * O_NONBLOCK in the file structure is not safe because of the race between
223  * userland and the aio daemons.
224  */
225 
226 struct aiocblist {
227 	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list for backend */
228 	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
229 	TAILQ_ENTRY(aiocblist) allist;  /* (a) list of all jobs in proc */
230 	int	jobflags;		/* (a) job flags */
231 	int	jobstate;		/* (b) job state */
232 	int	inputcharge;		/* (*) input blocks */
233 	int	outputcharge;		/* (*) output blocks */
234 	struct	buf *bp;		/* (*) private to BIO backend,
235 				  	 * buffer pointer
236 					 */
237 	struct	proc *userproc;		/* (*) user process */
238 	struct  ucred *cred;		/* (*) active credential when created */
239 	struct	file *fd_file;		/* (*) pointer to file structure */
240 	struct	aioliojob *lio;		/* (*) optional lio job */
241 	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
242 	struct	knlist klist;		/* (a) list of knotes */
243 	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
244 	ksiginfo_t ksi;			/* (a) realtime signal info */
245 	struct	task biotask;		/* (*) private to BIO backend */
246 	uint64_t seqno;			/* (*) job number */
247 	int	pending;		/* (a) number of pending I/O, aio_fsync only */
248 };
249 
250 /* jobflags */
251 #define AIOCBLIST_DONE		0x01
252 #define AIOCBLIST_BUFDONE	0x02
253 #define AIOCBLIST_RUNDOWN	0x04
254 #define AIOCBLIST_CHECKSYNC	0x08
255 
256 /*
257  * AIO process info
258  */
259 #define AIOP_FREE	0x1			/* proc on free queue */
260 
261 struct aiothreadlist {
262 	int aiothreadflags;			/* (c) AIO proc flags */
263 	TAILQ_ENTRY(aiothreadlist) list;	/* (c) list of processes */
264 	struct thread *aiothread;		/* (*) the AIO thread */
265 };
266 
267 /*
268  * data-structure for lio signal management
269  */
270 struct aioliojob {
271 	int	lioj_flags;			/* (a) listio flags */
272 	int	lioj_count;			/* (a) count of jobs in this lio */
273 	int	lioj_finished_count;		/* (a) count of finished jobs */
274 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
275 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
276 	struct  knlist klist;			/* (a) list of knotes */
277 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
278 };
279 
280 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
281 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
282 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
283 
284 /*
285  * per process aio data structure
286  */
287 struct kaioinfo {
288 	struct mtx	kaio_mtx;	/* the lock to protect this struct */
289 	int	kaio_flags;		/* (a) per process kaio flags */
290 	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
291 	int	kaio_active_count;	/* (c) number of currently used AIOs */
292 	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
293 	int	kaio_count;		/* (a) size of AIO queue */
294 	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
295 	int	kaio_buffer_count;	/* (a) number of physio buffers */
296 	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in the process */
297 	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
298 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
299 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
300 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue for process */
301 	TAILQ_HEAD(,aiocblist) kaio_sockqueue;  /* (a) queue for aios waiting on sockets,
302 						 *  NOT USED YET.
303 						 */
304 	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
305 	struct	task	kaio_task;	/* (*) task to kick aio threads */
306 };
307 
308 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
309 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
310 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
311 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
312 
313 #define KAIO_RUNDOWN	0x1	/* process is being run down */
314 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
315 
316 /*
317  * Operations used to interact with userland aio control blocks.
318  * Different ABIs provide their own operations.
319  */
320 struct aiocb_ops {
321 	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);
322 	long	(*fetch_status)(struct aiocb *ujob);
323 	long	(*fetch_error)(struct aiocb *ujob);
324 	int	(*store_status)(struct aiocb *ujob, long status);
325 	int	(*store_error)(struct aiocb *ujob, long error);
326 	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
327 	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
328 };
329 
330 static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* (c) Idle daemons */
331 static struct sema aio_newproc_sem;
332 static struct mtx aio_job_mtx;
333 static struct mtx aio_sock_mtx;
334 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* (c) Async job list */
335 static struct unrhdr *aiod_unr;
336 
337 void		aio_init_aioinfo(struct proc *p);
338 static int	aio_onceonly(void);
339 static int	aio_free_entry(struct aiocblist *aiocbe);
340 static void	aio_process(struct aiocblist *aiocbe);
341 static int	aio_newproc(int *);
342 int		aio_aqueue(struct thread *td, struct aiocb *job,
343 			struct aioliojob *lio, int type, struct aiocb_ops *ops);
344 static void	aio_physwakeup(struct buf *bp);
345 static void	aio_proc_rundown(void *arg, struct proc *p);
346 static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
347 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
348 static void	biohelper(void *, int);
349 static void	aio_daemon(void *param);
350 static void	aio_swake_cb(struct socket *, struct sockbuf *);
351 static int	aio_unload(void);
352 static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
353 #define DONE_BUF	1
354 #define DONE_QUEUE	2
355 static int	aio_kick(struct proc *userp);
356 static void	aio_kick_nowait(struct proc *userp);
357 static void	aio_kick_helper(void *context, int pending);
358 static int	filt_aioattach(struct knote *kn);
359 static void	filt_aiodetach(struct knote *kn);
360 static int	filt_aio(struct knote *kn, long hint);
361 static int	filt_lioattach(struct knote *kn);
362 static void	filt_liodetach(struct knote *kn);
363 static int	filt_lio(struct knote *kn, long hint);
364 
365 /*
366  * Zones for:
367  * 	kaio	Per process async io info
368  *	aiop	async io thread data
369  *	aiocb	async io jobs
370  *	aiol	list io job pointer - internal to aio_suspend XXX
371  *	aiolio	list io jobs
372  */
373 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
374 
375 /* kqueue filters for aio */
376 static struct filterops aio_filtops = {
377 	.f_isfd = 0,
378 	.f_attach = filt_aioattach,
379 	.f_detach = filt_aiodetach,
380 	.f_event = filt_aio,
381 };
382 static struct filterops lio_filtops = {
383 	.f_isfd = 0,
384 	.f_attach = filt_lioattach,
385 	.f_detach = filt_liodetach,
386 	.f_event = filt_lio
387 };
388 
389 static eventhandler_tag exit_tag, exec_tag;
390 
391 TASKQUEUE_DEFINE_THREAD(aiod_bio);
392 
393 /*
394  * Main operations function for use as a kernel module.
395  */
396 static int
397 aio_modload(struct module *module, int cmd, void *arg)
398 {
399 	int error = 0;
400 
401 	switch (cmd) {
402 	case MOD_LOAD:
403 		aio_onceonly();
404 		break;
405 	case MOD_UNLOAD:
406 		error = aio_unload();
407 		break;
408 	case MOD_SHUTDOWN:
409 		break;
410 	default:
411 		error = EINVAL;
412 		break;
413 	}
414 	return (error);
415 }
416 
417 static moduledata_t aio_mod = {
418 	"aio",
419 	&aio_modload,
420 	NULL
421 };
422 
423 static struct syscall_helper_data aio_syscalls[] = {
424 	SYSCALL_INIT_HELPER(aio_cancel),
425 	SYSCALL_INIT_HELPER(aio_error),
426 	SYSCALL_INIT_HELPER(aio_fsync),
427 	SYSCALL_INIT_HELPER(aio_read),
428 	SYSCALL_INIT_HELPER(aio_return),
429 	SYSCALL_INIT_HELPER(aio_suspend),
430 	SYSCALL_INIT_HELPER(aio_waitcomplete),
431 	SYSCALL_INIT_HELPER(aio_write),
432 	SYSCALL_INIT_HELPER(lio_listio),
433 	SYSCALL_INIT_HELPER(oaio_read),
434 	SYSCALL_INIT_HELPER(oaio_write),
435 	SYSCALL_INIT_HELPER(olio_listio),
436 	SYSCALL_INIT_LAST
437 };
438 
439 #ifdef COMPAT_FREEBSD32
440 #include <sys/mount.h>
441 #include <sys/socket.h>
442 #include <compat/freebsd32/freebsd32.h>
443 #include <compat/freebsd32/freebsd32_proto.h>
444 #include <compat/freebsd32/freebsd32_signal.h>
445 #include <compat/freebsd32/freebsd32_syscall.h>
446 #include <compat/freebsd32/freebsd32_util.h>
447 
448 static struct syscall_helper_data aio32_syscalls[] = {
449 	SYSCALL32_INIT_HELPER(freebsd32_aio_return),
450 	SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
451 	SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
452 	SYSCALL32_INIT_HELPER(freebsd32_aio_error),
453 	SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
454 	SYSCALL32_INIT_HELPER(freebsd32_aio_read),
455 	SYSCALL32_INIT_HELPER(freebsd32_aio_write),
456 	SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
457 	SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
458 	SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
459 	SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
460 	SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
461 	SYSCALL_INIT_LAST
462 };
463 #endif
464 
465 DECLARE_MODULE(aio, aio_mod,
466 	SI_SUB_VFS, SI_ORDER_ANY);
467 MODULE_VERSION(aio, 1);
468 
469 /*
470  * Startup initialization
471  */
472 static int
473 aio_onceonly(void)
474 {
475 	int error;
476 
477 	/* XXX: should probably just use so->callback */
478 	aio_swake = &aio_swake_cb;
479 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
480 	    EVENTHANDLER_PRI_ANY);
481 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
482 	    EVENTHANDLER_PRI_ANY);
483 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
484 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
485 	TAILQ_INIT(&aio_freeproc);
486 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
487 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
488 	mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
489 	TAILQ_INIT(&aio_jobs);
490 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
491 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
492 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
493 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
494 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
495 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
496 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
497 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
498 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
499 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
500 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
501 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
502 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
503 	jobrefid = 1;
504 	async_io_version = _POSIX_VERSION;
505 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
506 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
507 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
508 
509 	error = syscall_helper_register(aio_syscalls);
510 	if (error)
511 		return (error);
512 #ifdef COMPAT_FREEBSD32
513 	error = syscall32_helper_register(aio32_syscalls);
514 	if (error)
515 		return (error);
516 #endif
517 	return (0);
518 }
519 
520 /*
521  * Callback for unload of AIO when used as a module.
522  */
523 static int
524 aio_unload(void)
525 {
526 	int error;
527 
528 	/*
529 	 * XXX: no unloads by default, it's too dangerous.
530 	 * perhaps we could do it if locked out callers and then
531 	 * did an aio_proc_rundown() on each process.
532 	 *
533 	 * jhb: aio_proc_rundown() needs to run on curproc though,
534 	 * so I don't think that would fly.
535 	 */
536 	if (!unloadable)
537 		return (EOPNOTSUPP);
538 
539 #ifdef COMPAT_FREEBSD32
540 	syscall32_helper_unregister(aio32_syscalls);
541 #endif
542 	syscall_helper_unregister(aio_syscalls);
543 
544 	error = kqueue_del_filteropts(EVFILT_AIO);
545 	if (error)
546 		return error;
547 	error = kqueue_del_filteropts(EVFILT_LIO);
548 	if (error)
549 		return error;
550 	async_io_version = 0;
551 	aio_swake = NULL;
552 	taskqueue_free(taskqueue_aiod_bio);
553 	delete_unrhdr(aiod_unr);
554 	uma_zdestroy(kaio_zone);
555 	uma_zdestroy(aiop_zone);
556 	uma_zdestroy(aiocb_zone);
557 	uma_zdestroy(aiol_zone);
558 	uma_zdestroy(aiolio_zone);
559 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
560 	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
561 	mtx_destroy(&aio_job_mtx);
562 	mtx_destroy(&aio_sock_mtx);
563 	sema_destroy(&aio_newproc_sem);
564 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
565 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
566 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
567 	return (0);
568 }
569 
570 /*
571  * Init the per-process aioinfo structure.  The aioinfo limits are set
572  * per-process for user limit (resource) management.
573  */
574 void
575 aio_init_aioinfo(struct proc *p)
576 {
577 	struct kaioinfo *ki;
578 
579 	ki = uma_zalloc(kaio_zone, M_WAITOK);
580 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
581 	ki->kaio_flags = 0;
582 	ki->kaio_maxactive_count = max_aio_per_proc;
583 	ki->kaio_active_count = 0;
584 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
585 	ki->kaio_count = 0;
586 	ki->kaio_ballowed_count = max_buf_aio;
587 	ki->kaio_buffer_count = 0;
588 	TAILQ_INIT(&ki->kaio_all);
589 	TAILQ_INIT(&ki->kaio_done);
590 	TAILQ_INIT(&ki->kaio_jobqueue);
591 	TAILQ_INIT(&ki->kaio_bufqueue);
592 	TAILQ_INIT(&ki->kaio_liojoblist);
593 	TAILQ_INIT(&ki->kaio_sockqueue);
594 	TAILQ_INIT(&ki->kaio_syncqueue);
595 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
596 	PROC_LOCK(p);
597 	if (p->p_aioinfo == NULL) {
598 		p->p_aioinfo = ki;
599 		PROC_UNLOCK(p);
600 	} else {
601 		PROC_UNLOCK(p);
602 		mtx_destroy(&ki->kaio_mtx);
603 		uma_zfree(kaio_zone, ki);
604 	}
605 
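	/* Create AIO daemons as needed to reach the preferred thread count. */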
606 	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
607 		aio_newproc(NULL);
608 }
609 
610 static int
611 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
612 {
613 	struct thread *td;
614 	int error;
615 
616 	error = sigev_findtd(p, sigev, &td);
617 	if (error)
618 		return (error);
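	/* Queue the signal only if this job's ksiginfo is not already pending. */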
619 	if (!KSI_ONQ(ksi)) {
620 		ksiginfo_set_sigev(ksi, sigev);
621 		ksi->ksi_code = SI_ASYNCIO;
622 		ksi->ksi_flags |= KSI_EXT | KSI_INS;
623 		tdsendsignal(p, td, ksi->ksi_signo, ksi);
624 	}
625 	PROC_UNLOCK(p);
626 	return (error);
627 }
628 
629 /*
630  * Free a job entry.  Wait for completion if it is currently active, but don't
631  * delay forever.  If we delay, we return a flag that says that we have to
632  * restart the queue scan.
633  */
634 static int
635 aio_free_entry(struct aiocblist *aiocbe)
636 {
637 	struct kaioinfo *ki;
638 	struct aioliojob *lj;
639 	struct proc *p;
640 
641 	p = aiocbe->userproc;
642 	MPASS(curproc == p);
643 	ki = p->p_aioinfo;
644 	MPASS(ki != NULL);
645 
646 	AIO_LOCK_ASSERT(ki, MA_OWNED);
647 	MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
648 
649 	atomic_subtract_int(&num_queue_count, 1);
650 
651 	ki->kaio_count--;
652 	MPASS(ki->kaio_count >= 0);
653 
654 	TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
655 	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
656 
657 	lj = aiocbe->lio;
658 	if (lj) {
659 		lj->lioj_count--;
660 		lj->lioj_finished_count--;
661 
662 		if (lj->lioj_count == 0) {
663 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
664 			/* lio is going away, we need to destroy any knotes */
665 			knlist_delete(&lj->klist, curthread, 1);
666 			PROC_LOCK(p);
667 			sigqueue_take(&lj->lioj_ksi);
668 			PROC_UNLOCK(p);
669 			uma_zfree(aiolio_zone, lj);
670 		}
671 	}
672 
673 	/* aiocbe is going away, we need to destroy any knotes */
674 	knlist_delete(&aiocbe->klist, curthread, 1);
675 	PROC_LOCK(p);
676 	sigqueue_take(&aiocbe->ksi);
677 	PROC_UNLOCK(p);
678 
679 	MPASS(aiocbe->bp == NULL);
680 	aiocbe->jobstate = JOBST_NULL;
681 	AIO_UNLOCK(ki);
682 
683 	/*
684 	 * The thread argument here is used to find the owning process
685 	 * and is also passed to fo_close() which may pass it to various
686 	 * places such as devsw close() routines.  Because of that, we
687 	 * need a thread pointer from the process owning the job that is
688 	 * persistent and won't disappear out from under us or move to
689 	 * another process.
690 	 *
691 	 * Currently, all the callers of this function call it to remove
692 	 * an aiocblist from the current process' job list either via a
693 	 * syscall or due to the current process calling exit() or
694 	 * execve().  Thus, we know that p == curproc.  We also know that
695 	 * curthread can't exit since we are curthread.
696 	 *
697 	 * Therefore, we use curthread as the thread to pass to
698 	 * knlist_delete().  This does mean that it is possible for the
699 	 * thread pointer at close time to differ from the thread pointer
700 	 * at open time, but this is already true of file descriptors in
701 	 * a multithreaded process.
702 	 */
703 	fdrop(aiocbe->fd_file, curthread);
704 	crfree(aiocbe->cred);
705 	uma_zfree(aiocb_zone, aiocbe);
706 	AIO_LOCK(ki);
707 
708 	return (0);
709 }
710 
711 static void
712 aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
713 {
714    	aio_proc_rundown(arg, p);
715 }
716 
717 /*
718  * Rundown the jobs for a given process.
719  */
720 static void
721 aio_proc_rundown(void *arg, struct proc *p)
722 {
723 	struct kaioinfo *ki;
724 	struct aioliojob *lj;
725 	struct aiocblist *cbe, *cbn;
726 	struct file *fp;
727 	struct socket *so;
728 	int remove;
729 
730 	KASSERT(curthread->td_proc == p,
731 	    ("%s: called on non-curproc", __func__));
732 	ki = p->p_aioinfo;
733 	if (ki == NULL)
734 		return;
735 
736 	AIO_LOCK(ki);
737 	ki->kaio_flags |= KAIO_RUNDOWN;
738 
739 restart:
740 
741 	/*
742 	 * Try to cancel all pending requests. This code simulates
743 	 * aio_cancel on all pending I/O requests.
744 	 */
745 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
746 		remove = 0;
747 		mtx_lock(&aio_job_mtx);
748 		if (cbe->jobstate == JOBST_JOBQGLOBAL) {
749 			TAILQ_REMOVE(&aio_jobs, cbe, list);
750 			remove = 1;
751 		} else if (cbe->jobstate == JOBST_JOBQSOCK) {
752 			fp = cbe->fd_file;
753 			MPASS(fp->f_type == DTYPE_SOCKET);
754 			so = fp->f_data;
755 			TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
756 			remove = 1;
757 		} else if (cbe->jobstate == JOBST_JOBQSYNC) {
758 			TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
759 			remove = 1;
760 		}
761 		mtx_unlock(&aio_job_mtx);
762 
763 		if (remove) {
764 			cbe->jobstate = JOBST_JOBFINISHED;
765 			cbe->uaiocb._aiocb_private.status = -1;
766 			cbe->uaiocb._aiocb_private.error = ECANCELED;
767 			TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
768 			aio_bio_done_notify(p, cbe, DONE_QUEUE);
769 		}
770 	}
771 
772 	/* Wait for all running I/O to be finished */
773 	if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
774 	    TAILQ_FIRST(&ki->kaio_jobqueue)) {
775 		ki->kaio_flags |= KAIO_WAKEUP;
776 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
777 		goto restart;
778 	}
779 
780 	/* Free all completed I/O requests. */
781 	while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
782 		aio_free_entry(cbe);
783 
784 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
785 		if (lj->lioj_count == 0) {
786 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
787 			knlist_delete(&lj->klist, curthread, 1);
788 			PROC_LOCK(p);
789 			sigqueue_take(&lj->lioj_ksi);
790 			PROC_UNLOCK(p);
791 			uma_zfree(aiolio_zone, lj);
792 		} else {
793 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
794 			    lj->lioj_count, lj->lioj_finished_count);
795 		}
796 	}
797 	AIO_UNLOCK(ki);
798 	taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
799 	mtx_destroy(&ki->kaio_mtx);
800 	uma_zfree(kaio_zone, ki);
801 	p->p_aioinfo = NULL;
802 }
803 
804 /*
805  * Select a job to run (called by an AIO daemon).
806  */
807 static struct aiocblist *
808 aio_selectjob(struct aiothreadlist *aiop)
809 {
810 	struct aiocblist *aiocbe;
811 	struct kaioinfo *ki;
812 	struct proc *userp;
813 
814 	mtx_assert(&aio_job_mtx, MA_OWNED);
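	/*
	 * Pick the first queued job whose owning process is still below its
	 * per-process limit on active jobs.
	 */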
815 	TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
816 		userp = aiocbe->userproc;
817 		ki = userp->p_aioinfo;
818 
819 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
820 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
821 			/* Account for currently active jobs. */
822 			ki->kaio_active_count++;
823 			aiocbe->jobstate = JOBST_JOBRUNNING;
824 			break;
825 		}
826 	}
827 	return (aiocbe);
828 }
829 
830 /*
831  *  Move all data to a permanent storage device; this code
832  *  simulates the fsync syscall.
833  */
834 static int
835 aio_fsync_vnode(struct thread *td, struct vnode *vp)
836 {
837 	struct mount *mp;
838 	int error;
839 
840 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
841 		goto drop;
842 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
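	/* Flush any dirty pages backing the vnode before calling VOP_FSYNC(). */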
843 	if (vp->v_object != NULL) {
844 		VM_OBJECT_LOCK(vp->v_object);
845 		vm_object_page_clean(vp->v_object, 0, 0, 0);
846 		VM_OBJECT_UNLOCK(vp->v_object);
847 	}
848 	error = VOP_FSYNC(vp, MNT_WAIT, td);
849 
850 	VOP_UNLOCK(vp, 0);
851 	vn_finished_write(mp);
852 drop:
853 	return (error);
854 }
855 
856 /*
857  * The AIO processing activity.  This is the code that does the I/O request for
858  * the non-physio version of the operations.  The normal vn operations are used,
859  * and this code should work in all instances for every type of file, including
860  * pipes, sockets, fifos, and regular files.
861  *
862  * XXX I don't think it works well for sockets, pipes, and fifos.
863  */
864 static void
865 aio_process(struct aiocblist *aiocbe)
866 {
867 	struct ucred *td_savedcred;
868 	struct thread *td;
869 	struct aiocb *cb;
870 	struct file *fp;
871 	struct socket *so;
872 	struct uio auio;
873 	struct iovec aiov;
874 	int cnt;
875 	int error;
876 	int oublock_st, oublock_end;
877 	int inblock_st, inblock_end;
878 
879 	td = curthread;
880 	td_savedcred = td->td_ucred;
881 	td->td_ucred = aiocbe->cred;
882 	cb = &aiocbe->uaiocb;
883 	fp = aiocbe->fd_file;
884 
885 	if (cb->aio_lio_opcode == LIO_SYNC) {
886 		error = 0;
887 		cnt = 0;
888 		if (fp->f_vnode != NULL)
889 			error = aio_fsync_vnode(td, fp->f_vnode);
890 		cb->_aiocb_private.error = error;
891 		cb->_aiocb_private.status = 0;
892 		td->td_ucred = td_savedcred;
893 		return;
894 	}
895 
896 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
897 	aiov.iov_len = cb->aio_nbytes;
898 
899 	auio.uio_iov = &aiov;
900 	auio.uio_iovcnt = 1;
901 	auio.uio_offset = cb->aio_offset;
902 	auio.uio_resid = cb->aio_nbytes;
903 	cnt = cb->aio_nbytes;
904 	auio.uio_segflg = UIO_USERSPACE;
905 	auio.uio_td = td;
906 
907 	inblock_st = td->td_ru.ru_inblock;
908 	oublock_st = td->td_ru.ru_oublock;
909 	/*
910 	 * aio_aqueue() acquires a reference to the file that is
911 	 * released in aio_free_entry().
912 	 */
913 	if (cb->aio_lio_opcode == LIO_READ) {
914 		auio.uio_rw = UIO_READ;
915 		if (auio.uio_resid == 0)
916 			error = 0;
917 		else
918 			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
919 	} else {
920 		if (fp->f_type == DTYPE_VNODE)
921 			bwillwrite();
922 		auio.uio_rw = UIO_WRITE;
923 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
924 	}
925 	inblock_end = td->td_ru.ru_inblock;
926 	oublock_end = td->td_ru.ru_oublock;
927 
928 	aiocbe->inputcharge = inblock_end - inblock_st;
929 	aiocbe->outputcharge = oublock_end - oublock_st;
930 
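	/*
	 * On a partially completed transfer, ignore interruption-style
	 * errors; for EPIPE on a write, deliver SIGPIPE unless the socket
	 * has SO_NOSIGPIPE set.
	 */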
931 	if ((error) && (auio.uio_resid != cnt)) {
932 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
933 			error = 0;
934 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
935 			int sigpipe = 1;
936 			if (fp->f_type == DTYPE_SOCKET) {
937 				so = fp->f_data;
938 				if (so->so_options & SO_NOSIGPIPE)
939 					sigpipe = 0;
940 			}
941 			if (sigpipe) {
942 				PROC_LOCK(aiocbe->userproc);
943 				kern_psignal(aiocbe->userproc, SIGPIPE);
944 				PROC_UNLOCK(aiocbe->userproc);
945 			}
946 		}
947 	}
948 
949 	cnt -= auio.uio_resid;
950 	cb->_aiocb_private.error = error;
951 	cb->_aiocb_private.status = cnt;
952 	td->td_ucred = td_savedcred;
953 }
954 
955 static void
956 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
957 {
958 	struct aioliojob *lj;
959 	struct kaioinfo *ki;
960 	struct aiocblist *scb, *scbn;
961 	int lj_done;
962 
963 	ki = userp->p_aioinfo;
964 	AIO_LOCK_ASSERT(ki, MA_OWNED);
965 	lj = aiocbe->lio;
966 	lj_done = 0;
967 	if (lj) {
968 		lj->lioj_finished_count++;
969 		if (lj->lioj_count == lj->lioj_finished_count)
970 			lj_done = 1;
971 	}
972 	if (type == DONE_QUEUE) {
973 		aiocbe->jobflags |= AIOCBLIST_DONE;
974 	} else {
975 		aiocbe->jobflags |= AIOCBLIST_BUFDONE;
976 	}
977 	TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
978 	aiocbe->jobstate = JOBST_JOBFINISHED;
979 
980 	if (ki->kaio_flags & KAIO_RUNDOWN)
981 		goto notification_done;
982 
983 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
984 	    aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
985 		aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
986 
987 	KNOTE_LOCKED(&aiocbe->klist, 1);
988 
989 	if (lj_done) {
990 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
991 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
992 			KNOTE_LOCKED(&lj->klist, 1);
993 		}
994 		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
995 		    == LIOJ_SIGNAL
996 		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
997 		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
998 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
999 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1000 		}
1001 	}
1002 
1003 notification_done:
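	/*
	 * If an aio_fsync() job was waiting on this I/O, drop its pending
	 * count and requeue it once all of its dependencies have finished.
	 */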
1004 	if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
1005 		TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
1006 			if (aiocbe->fd_file == scb->fd_file &&
1007 			    aiocbe->seqno < scb->seqno) {
1008 				if (--scb->pending == 0) {
1009 					mtx_lock(&aio_job_mtx);
1010 					scb->jobstate = JOBST_JOBQGLOBAL;
1011 					TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
1012 					TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
1013 					aio_kick_nowait(userp);
1014 					mtx_unlock(&aio_job_mtx);
1015 				}
1016 			}
1017 		}
1018 	}
1019 	if (ki->kaio_flags & KAIO_WAKEUP) {
1020 		ki->kaio_flags &= ~KAIO_WAKEUP;
1021 		wakeup(&userp->p_aioinfo);
1022 	}
1023 }
1024 
1025 /*
1026  * The AIO daemon.  Most of the actual work is done in aio_process(),
1027  * but the setup (and address space management) is done in this routine.
1028  */
1029 static void
1030 aio_daemon(void *_id)
1031 {
1032 	struct aiocblist *aiocbe;
1033 	struct aiothreadlist *aiop;
1034 	struct kaioinfo *ki;
1035 	struct proc *curcp, *mycp, *userp;
1036 	struct vmspace *myvm, *tmpvm;
1037 	struct thread *td = curthread;
1038 	int id = (intptr_t)_id;
1039 
1040 	/*
1041 	 * Local copies of curproc (cp) and vmspace (myvm)
1042 	 */
1043 	mycp = td->td_proc;
1044 	myvm = mycp->p_vmspace;
1045 
1046 	KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
1047 
1048 	/*
1049 	 * Allocate and ready the aio control info.  There is one aiop structure
1050 	 * per daemon.
1051 	 */
1052 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
1053 	aiop->aiothread = td;
1054 	aiop->aiothreadflags = 0;
1055 
1056 	/* The daemon resides in its own pgrp. */
1057 	sys_setsid(td, NULL);
1058 
1059 	/*
1060 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
1061 	 * and creating too many daemons.)
1062 	 */
1063 	sema_post(&aio_newproc_sem);
1064 
1065 	mtx_lock(&aio_job_mtx);
1066 	for (;;) {
1067 		/*
1068 		 * curcp is the current daemon process context.
1069 		 * userp is the current user process context.
1070 		 */
1071 		curcp = mycp;
1072 
1073 		/*
1074 		 * Take daemon off of free queue
1075 		 */
1076 		if (aiop->aiothreadflags & AIOP_FREE) {
1077 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1078 			aiop->aiothreadflags &= ~AIOP_FREE;
1079 		}
1080 
1081 		/*
1082 		 * Check for jobs.
1083 		 */
1084 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
1085 			mtx_unlock(&aio_job_mtx);
1086 			userp = aiocbe->userproc;
1087 
1088 			/*
1089 			 * Connect to process address space for user program.
1090 			 */
1091 			if (userp != curcp) {
1092 				/*
1093 				 * Save the current address space that we are
1094 				 * connected to.
1095 				 */
1096 				tmpvm = mycp->p_vmspace;
1097 
1098 				/*
1099 				 * Point to the new user address space, and
1100 				 * refer to it.
1101 				 */
1102 				mycp->p_vmspace = userp->p_vmspace;
1103 				atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
1104 
1105 				/* Activate the new mapping. */
1106 				pmap_activate(FIRST_THREAD_IN_PROC(mycp));
1107 
1108 				/*
1109 				 * If the old address space wasn't the daemon's
1110 				 * own address space, then we need to remove the
1111 				 * daemon's reference from the other process
1112 				 * that it was acting on behalf of.
1113 				 */
1114 				if (tmpvm != myvm) {
1115 					vmspace_free(tmpvm);
1116 				}
1117 				curcp = userp;
1118 			}
1119 
1120 			ki = userp->p_aioinfo;
1121 
1122 			/* Do the I/O function. */
1123 			aio_process(aiocbe);
1124 
1125 			mtx_lock(&aio_job_mtx);
1126 			/* Decrement the active job count. */
1127 			ki->kaio_active_count--;
1128 			mtx_unlock(&aio_job_mtx);
1129 
1130 			AIO_LOCK(ki);
1131 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
1132 			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
1133 			AIO_UNLOCK(ki);
1134 
1135 			mtx_lock(&aio_job_mtx);
1136 		}
1137 
1138 		/*
1139 		 * Disconnect from user address space.
1140 		 */
1141 		if (curcp != mycp) {
1142 
1143 			mtx_unlock(&aio_job_mtx);
1144 
1145 			/* Get the user address space to disconnect from. */
1146 			tmpvm = mycp->p_vmspace;
1147 
1148 			/* Get original address space for daemon. */
1149 			mycp->p_vmspace = myvm;
1150 
1151 			/* Activate the daemon's address space. */
1152 			pmap_activate(FIRST_THREAD_IN_PROC(mycp));
1153 #ifdef DIAGNOSTIC
1154 			if (tmpvm == myvm) {
1155 				printf("AIOD: vmspace problem -- %d\n",
1156 				    mycp->p_pid);
1157 			}
1158 #endif
1159 			/* Remove our vmspace reference. */
1160 			vmspace_free(tmpvm);
1161 
1162 			curcp = mycp;
1163 
1164 			mtx_lock(&aio_job_mtx);
1165 			/*
1166 			 * We have to restart to avoid a race; we only sleep
1167 			 * if no job can be selected, which should only be the
1168 			 * case when curcp == mycp.
1169 			 */
1170 			continue;
1171 		}
1172 
1173 		mtx_assert(&aio_job_mtx, MA_OWNED);
1174 
1175 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
1176 		aiop->aiothreadflags |= AIOP_FREE;
1177 
1178 		/*
1179 		 * If daemon is inactive for a long time, allow it to exit,
1180 		 * thereby freeing resources.
1181 		 */
1182 		if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
1183 		    aiod_lifetime)) {
1184 			if (TAILQ_EMPTY(&aio_jobs)) {
1185 				if ((aiop->aiothreadflags & AIOP_FREE) &&
1186 				    (num_aio_procs > target_aio_procs)) {
1187 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
1188 					num_aio_procs--;
1189 					mtx_unlock(&aio_job_mtx);
1190 					uma_zfree(aiop_zone, aiop);
1191 					free_unr(aiod_unr, id);
1192 #ifdef DIAGNOSTIC
1193 					if (mycp->p_vmspace->vm_refcnt <= 1) {
1194 						printf("AIOD: bad vm refcnt for"
1195 						    " exiting daemon: %d\n",
1196 						    mycp->p_vmspace->vm_refcnt);
1197 					}
1198 #endif
1199 					kproc_exit(0);
1200 				}
1201 			}
1202 		}
1203 	}
1204 	mtx_unlock(&aio_job_mtx);
1205 	panic("shouldn't be here\n");
1206 }
1207 
1208 /*
1209  * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
1210  * AIO daemon modifies its environment itself.
1211  */
1212 static int
1213 aio_newproc(int *start)
1214 {
1215 	int error;
1216 	struct proc *p;
1217 	int id;
1218 
1219 	id = alloc_unr(aiod_unr);
1220 	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
1221 		RFNOWAIT, 0, "aiod%d", id);
1222 	if (error == 0) {
1223 		/*
1224 		 * Wait until daemon is started.
1225 		 */
1226 		sema_wait(&aio_newproc_sem);
1227 		mtx_lock(&aio_job_mtx);
1228 		num_aio_procs++;
1229 		if (start != NULL)
1230 			(*start)--;
1231 		mtx_unlock(&aio_job_mtx);
1232 	} else {
1233 		free_unr(aiod_unr, id);
1234 	}
1235 	return (error);
1236 }
1237 
1238 /*
1239  * Try the high-performance, low-overhead physio method for eligible
1240  * VCHR devices.  This method doesn't use an aio helper thread, and
1241  * thus has very low overhead.
1242  *
1243  * Assumes that the caller, aio_aqueue(), has incremented the file
1244  * structure's reference count, preventing its deallocation for the
1245  * duration of this call.
1246  */
1247 static int
1248 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1249 {
1250 	struct aiocb *cb;
1251 	struct file *fp;
1252 	struct buf *bp;
1253 	struct vnode *vp;
1254 	struct kaioinfo *ki;
1255 	struct aioliojob *lj;
1256 	int error;
1257 
1258 	cb = &aiocbe->uaiocb;
1259 	fp = aiocbe->fd_file;
1260 
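	/*
	 * A return value of -1 means the request is not eligible for physio;
	 * aio_aqueue() will then queue it for the daemon-based path instead.
	 */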
1261 	if (fp->f_type != DTYPE_VNODE)
1262 		return (-1);
1263 
1264 	vp = fp->f_vnode;
1265 
1266 	/*
1267 	 * If it's not a disk, we don't want to return a positive error.
1268 	 * A positive error keeps the aio code from falling through to the
1269 	 * thread-based path when the target is a regular file.
1270 	 */
1271 	if (!vn_isdisk(vp, &error)) {
1272 		if (error == ENOTBLK)
1273 			return (-1);
1274 		else
1275 			return (error);
1276 	}
1277 
1278 	if (vp->v_bufobj.bo_bsize == 0)
1279 		return (-1);
1280 
1281  	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
1282 		return (-1);
1283 
1284 	if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
1285 		return (-1);
1286 
1287 	if (cb->aio_nbytes >
1288 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1289 		return (-1);
1290 
1291 	ki = p->p_aioinfo;
1292 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1293 		return (-1);
1294 
1295 	/* Create and build a buffer header for a transfer. */
1296 	bp = (struct buf *)getpbuf(NULL);
1297 	BUF_KERNPROC(bp);
1298 
1299 	AIO_LOCK(ki);
1300 	ki->kaio_count++;
1301 	ki->kaio_buffer_count++;
1302 	lj = aiocbe->lio;
1303 	if (lj)
1304 		lj->lioj_count++;
1305 	AIO_UNLOCK(ki);
1306 
1307 	/*
1308 	 * Get a copy of the kva from the physical buffer.
1309 	 */
1310 	error = 0;
1311 
1312 	bp->b_bcount = cb->aio_nbytes;
1313 	bp->b_bufsize = cb->aio_nbytes;
1314 	bp->b_iodone = aio_physwakeup;
1315 	bp->b_saveaddr = bp->b_data;
1316 	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1317 	bp->b_offset = cb->aio_offset;
1318 	bp->b_iooffset = cb->aio_offset;
1319 	bp->b_blkno = btodb(cb->aio_offset);
1320 	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
1321 
1322 	/*
1323 	 * Bring buffer into kernel space.
1324 	 */
1325 	if (vmapbuf(bp) < 0) {
1326 		error = EFAULT;
1327 		goto doerror;
1328 	}
1329 
1330 	AIO_LOCK(ki);
1331 	aiocbe->bp = bp;
1332 	bp->b_caller1 = (void *)aiocbe;
1333 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1334 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1335 	aiocbe->jobstate = JOBST_JOBQBUF;
1336 	cb->_aiocb_private.status = cb->aio_nbytes;
1337 	AIO_UNLOCK(ki);
1338 
1339 	atomic_add_int(&num_queue_count, 1);
1340 	atomic_add_int(&num_buf_aio, 1);
1341 
1342 	bp->b_error = 0;
1343 
1344 	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
1345 
1346 	/* Perform transfer. */
1347 	dev_strategy(vp->v_rdev, bp);
1348 	return (0);
1349 
1350 doerror:
1351 	AIO_LOCK(ki);
1352 	ki->kaio_count--;
1353 	ki->kaio_buffer_count--;
1354 	if (lj)
1355 		lj->lioj_count--;
1356 	aiocbe->bp = NULL;
1357 	AIO_UNLOCK(ki);
1358 	relpbuf(bp, NULL);
1359 	return (error);
1360 }
1361 
1362 /*
1363  * Wake up aio requests that may be serviceable now.
1364  */
1365 static void
1366 aio_swake_cb(struct socket *so, struct sockbuf *sb)
1367 {
1368 	struct aiocblist *cb, *cbn;
1369 	int opcode;
1370 
1371 	SOCKBUF_LOCK_ASSERT(sb);
1372 	if (sb == &so->so_snd)
1373 		opcode = LIO_WRITE;
1374 	else
1375 		opcode = LIO_READ;
1376 
1377 	sb->sb_flags &= ~SB_AIO;
1378 	mtx_lock(&aio_job_mtx);
1379 	TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
1380 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1381 			if (cb->jobstate != JOBST_JOBQSOCK)
1382 				panic("invalid queue value");
1383 			/* XXX
1384 			 * We don't have actual sockets backend yet,
1385 			 * so we simply move the requests to the generic
1386 			 * file I/O backend.
1387 			 */
1388 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1389 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1390 			aio_kick_nowait(cb->userproc);
1391 		}
1392 	}
1393 	mtx_unlock(&aio_job_mtx);
1394 }
1395 
1396 static int
1397 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
1398 {
1399 
1400 	/*
1401 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
1402 	 * supported by AIO with the old sigevent structure.
1403 	 */
1404 	nsig->sigev_notify = osig->sigev_notify;
1405 	switch (nsig->sigev_notify) {
1406 	case SIGEV_NONE:
1407 		break;
1408 	case SIGEV_SIGNAL:
1409 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
1410 		break;
1411 	case SIGEV_KEVENT:
1412 		nsig->sigev_notify_kqueue =
1413 		    osig->__sigev_u.__sigev_notify_kqueue;
1414 		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
1415 		break;
1416 	default:
1417 		return (EINVAL);
1418 	}
1419 	return (0);
1420 }
1421 
1422 static int
1423 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
1424 {
1425 	struct oaiocb *ojob;
1426 	int error;
1427 
1428 	bzero(kjob, sizeof(struct aiocb));
1429 	error = copyin(ujob, kjob, sizeof(struct oaiocb));
1430 	if (error)
1431 		return (error);
1432 	ojob = (struct oaiocb *)kjob;
1433 	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
1434 }
1435 
1436 static int
1437 aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
1438 {
1439 
1440 	return (copyin(ujob, kjob, sizeof(struct aiocb)));
1441 }
1442 
1443 static long
1444 aiocb_fetch_status(struct aiocb *ujob)
1445 {
1446 
1447 	return (fuword(&ujob->_aiocb_private.status));
1448 }
1449 
1450 static long
1451 aiocb_fetch_error(struct aiocb *ujob)
1452 {
1453 
1454 	return (fuword(&ujob->_aiocb_private.error));
1455 }
1456 
1457 static int
1458 aiocb_store_status(struct aiocb *ujob, long status)
1459 {
1460 
1461 	return (suword(&ujob->_aiocb_private.status, status));
1462 }
1463 
1464 static int
1465 aiocb_store_error(struct aiocb *ujob, long error)
1466 {
1467 
1468 	return (suword(&ujob->_aiocb_private.error, error));
1469 }
1470 
1471 static int
1472 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
1473 {
1474 
1475 	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
1476 }
1477 
1478 static int
1479 aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
1480 {
1481 
1482 	return (suword(ujobp, (long)ujob));
1483 }
1484 
1485 static struct aiocb_ops aiocb_ops = {
1486 	.copyin = aiocb_copyin,
1487 	.fetch_status = aiocb_fetch_status,
1488 	.fetch_error = aiocb_fetch_error,
1489 	.store_status = aiocb_store_status,
1490 	.store_error = aiocb_store_error,
1491 	.store_kernelinfo = aiocb_store_kernelinfo,
1492 	.store_aiocb = aiocb_store_aiocb,
1493 };
1494 
1495 static struct aiocb_ops aiocb_ops_osigevent = {
1496 	.copyin = aiocb_copyin_old_sigevent,
1497 	.fetch_status = aiocb_fetch_status,
1498 	.fetch_error = aiocb_fetch_error,
1499 	.store_status = aiocb_store_status,
1500 	.store_error = aiocb_store_error,
1501 	.store_kernelinfo = aiocb_store_kernelinfo,
1502 	.store_aiocb = aiocb_store_aiocb,
1503 };
1504 
1505 /*
1506  * Queue a new AIO request.  The choice between the threaded and the
1507  * direct physio (VCHR) techniques is made in this code.
1508  */
1509 int
1510 aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
1511 	int type, struct aiocb_ops *ops)
1512 {
1513 	struct proc *p = td->td_proc;
1514 	struct file *fp;
1515 	struct socket *so;
1516 	struct aiocblist *aiocbe, *cb;
1517 	struct kaioinfo *ki;
1518 	struct kevent kev;
1519 	struct sockbuf *sb;
1520 	int opcode;
1521 	int error;
1522 	int fd, kqfd;
1523 	int jid;
1524 	u_short evflags;
1525 
1526 	if (p->p_aioinfo == NULL)
1527 		aio_init_aioinfo(p);
1528 
1529 	ki = p->p_aioinfo;
1530 
1531 	ops->store_status(job, -1);
1532 	ops->store_error(job, 0);
1533 	ops->store_kernelinfo(job, -1);
1534 
1535 	if (num_queue_count >= max_queue_count ||
1536 	    ki->kaio_count >= ki->kaio_qallowed_count) {
1537 		ops->store_error(job, EAGAIN);
1538 		return (EAGAIN);
1539 	}
1540 
1541 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
1542 	aiocbe->inputcharge = 0;
1543 	aiocbe->outputcharge = 0;
1544 	knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
1545 
1546 	error = ops->copyin(job, &aiocbe->uaiocb);
1547 	if (error) {
1548 		ops->store_error(job, error);
1549 		uma_zfree(aiocb_zone, aiocbe);
1550 		return (error);
1551 	}
1552 
1553 	/* XXX: aio_nbytes is later cast to signed types. */
1554 	if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
1555 		uma_zfree(aiocb_zone, aiocbe);
1556 		return (EINVAL);
1557 	}
1558 
1559 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
1560 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
1561 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
1562 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
1563 		ops->store_error(job, EINVAL);
1564 		uma_zfree(aiocb_zone, aiocbe);
1565 		return (EINVAL);
1566 	}
1567 
1568 	if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1569 	     aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
1570 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1571 		uma_zfree(aiocb_zone, aiocbe);
1572 		return (EINVAL);
1573 	}
1574 
1575 	ksiginfo_init(&aiocbe->ksi);
1576 
1577 	/* Save userspace address of the job info. */
1578 	aiocbe->uuaiocb = job;
1579 
1580 	/* Get the opcode. */
1581 	if (type != LIO_NOP)
1582 		aiocbe->uaiocb.aio_lio_opcode = type;
1583 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1584 
1585 	/*
1586 	 * Validate the opcode and fetch the file object for the specified
1587 	 * file descriptor.
1588 	 *
1589 	 * XXXRW: Moved the opcode validation up here so that we don't
1590 	 * retrieve a file descriptor without knowing what the capability
1591 	 * should be.
1592 	 */
1593 	fd = aiocbe->uaiocb.aio_fildes;
1594 	switch (opcode) {
1595 	case LIO_WRITE:
1596 		error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
1597 		break;
1598 	case LIO_READ:
1599 		error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
1600 		break;
1601 	case LIO_SYNC:
1602 		error = fget(td, fd, CAP_FSYNC, &fp);
1603 		break;
1604 	case LIO_NOP:
1605 		error = fget(td, fd, 0, &fp);
1606 		break;
1607 	default:
1608 		error = EINVAL;
1609 	}
1610 	if (error) {
1611 		uma_zfree(aiocb_zone, aiocbe);
1612 		ops->store_error(job, error);
1613 		return (error);
1614 	}
1615 
1616 	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
1617 		error = EINVAL;
1618 		goto aqueue_fail;
1619 	}
1620 
1621 	if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
1622 		error = EINVAL;
1623 		goto aqueue_fail;
1624 	}
1625 
1626 	aiocbe->fd_file = fp;
1627 
1628 	mtx_lock(&aio_job_mtx);
1629 	jid = jobrefid++;
1630 	aiocbe->seqno = jobseqno++;
1631 	mtx_unlock(&aio_job_mtx);
1632 	error = ops->store_kernelinfo(job, jid);
1633 	if (error) {
1634 		error = EINVAL;
1635 		goto aqueue_fail;
1636 	}
1637 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
1638 
1639 	if (opcode == LIO_NOP) {
1640 		fdrop(fp, td);
1641 		uma_zfree(aiocb_zone, aiocbe);
1642 		return (0);
1643 	}
1644 
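	/*
	 * For SIGEV_KEVENT notification, register a kevent on the requested
	 * kqueue so completion is reported through EVFILT_AIO.
	 */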
1645 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
1646 		goto no_kqueue;
1647 	evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
1648 	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
1649 		error = EINVAL;
1650 		goto aqueue_fail;
1651 	}
1652 	kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1653 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
1654 	kev.filter = EVFILT_AIO;
1655 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
1656 	kev.data = (intptr_t)aiocbe;
1657 	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
1658 	error = kqfd_register(kqfd, &kev, td, 1);
1659 aqueue_fail:
1660 	if (error) {
1661 		fdrop(fp, td);
1662 		uma_zfree(aiocb_zone, aiocbe);
1663 		ops->store_error(job, error);
1664 		goto done;
1665 	}
1666 no_kqueue:
1667 
1668 	ops->store_error(job, EINPROGRESS);
1669 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1670 	aiocbe->userproc = p;
1671 	aiocbe->cred = crhold(td->td_ucred);
1672 	aiocbe->jobflags = 0;
1673 	aiocbe->lio = lj;
1674 
1675 	if (opcode == LIO_SYNC)
1676 		goto queueit;
1677 
1678 	if (fp->f_type == DTYPE_SOCKET) {
1679 		/*
1680 		 * Alternate queueing for socket ops: Reach down into the
1681 		 * descriptor to get the socket data.  Then check to see if the
1682 		 * socket is ready to be read or written (based on the requested
1683 		 * operation).
1684 		 *
1685 		 * If it is not ready for I/O, then queue the aiocbe on the
1686 		 * socket, and set the flags so we get a call when sbnotify()
1687 		 * happens.
1688 		 *
1689 		 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock
1690 		 * and unlock the snd sockbuf for no reason.
1691 		 */
1692 		so = fp->f_data;
1693 		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
1694 		SOCKBUF_LOCK(sb);
1695 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1696 		    LIO_WRITE) && (!sowriteable(so)))) {
1697 			sb->sb_flags |= SB_AIO;
1698 
1699 			mtx_lock(&aio_job_mtx);
1700 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1701 			mtx_unlock(&aio_job_mtx);
1702 
1703 			AIO_LOCK(ki);
1704 			TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1705 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1706 			aiocbe->jobstate = JOBST_JOBQSOCK;
1707 			ki->kaio_count++;
1708 			if (lj)
1709 				lj->lioj_count++;
1710 			AIO_UNLOCK(ki);
1711 			SOCKBUF_UNLOCK(sb);
1712 			atomic_add_int(&num_queue_count, 1);
1713 			error = 0;
1714 			goto done;
1715 		}
1716 		SOCKBUF_UNLOCK(sb);
1717 	}
1718 
1719 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1720 		goto done;
1721 #if 0
1722 	if (error > 0) {
1723 		aiocbe->uaiocb._aiocb_private.error = error;
1724 		ops->store_error(job, error);
1725 		goto done;
1726 	}
1727 #endif
1728 queueit:
1729 	/* No buffer for daemon I/O. */
1730 	aiocbe->bp = NULL;
1731 	atomic_add_int(&num_queue_count, 1);
1732 
1733 	AIO_LOCK(ki);
1734 	ki->kaio_count++;
1735 	if (lj)
1736 		lj->lioj_count++;
1737 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1738 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
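	/*
	 * An LIO_SYNC job must wait for every earlier job on the same file;
	 * mark those jobs and count how many are still outstanding.
	 */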
1739 	if (opcode == LIO_SYNC) {
1740 		TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
1741 			if (cb->fd_file == aiocbe->fd_file &&
1742 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1743 			    cb->seqno < aiocbe->seqno) {
1744 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1745 				aiocbe->pending++;
1746 			}
1747 		}
1748 		TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
1749 			if (cb->fd_file == aiocbe->fd_file &&
1750 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1751 			    cb->seqno < aiocbe->seqno) {
1752 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1753 				aiocbe->pending++;
1754 			}
1755 		}
1756 		if (aiocbe->pending != 0) {
1757 			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
1758 			aiocbe->jobstate = JOBST_JOBQSYNC;
1759 			AIO_UNLOCK(ki);
1760 			goto done;
1761 		}
1762 	}
1763 	mtx_lock(&aio_job_mtx);
1764 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1765 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1766 	aio_kick_nowait(p);
1767 	mtx_unlock(&aio_job_mtx);
1768 	AIO_UNLOCK(ki);
1769 	error = 0;
1770 done:
1771 	return (error);
1772 }
1773 
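/*
 * Wake an idle aio daemon if one is available; otherwise schedule a task
 * that may create a new daemon for this process.
 */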
1774 static void
1775 aio_kick_nowait(struct proc *userp)
1776 {
1777 	struct kaioinfo *ki = userp->p_aioinfo;
1778 	struct aiothreadlist *aiop;
1779 
1780 	mtx_assert(&aio_job_mtx, MA_OWNED);
1781 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1782 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1783 		aiop->aiothreadflags &= ~AIOP_FREE;
1784 		wakeup(aiop->aiothread);
1785 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1786 	    ((ki->kaio_active_count + num_aio_resv_start) <
1787 	    ki->kaio_maxactive_count)) {
1788 		taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
1789 	}
1790 }
1791 
1792 static int
1793 aio_kick(struct proc *userp)
1794 {
1795 	struct kaioinfo *ki = userp->p_aioinfo;
1796 	struct aiothreadlist *aiop;
1797 	int error, ret = 0;
1798 
1799 	mtx_assert(&aio_job_mtx, MA_OWNED);
1800 retryproc:
1801 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1802 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1803 		aiop->aiothreadflags &= ~AIOP_FREE;
1804 		wakeup(aiop->aiothread);
1805 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1806 	    ((ki->kaio_active_count + num_aio_resv_start) <
1807 	    ki->kaio_maxactive_count)) {
1808 		num_aio_resv_start++;
1809 		mtx_unlock(&aio_job_mtx);
1810 		error = aio_newproc(&num_aio_resv_start);
1811 		mtx_lock(&aio_job_mtx);
1812 		if (error) {
1813 			num_aio_resv_start--;
1814 			goto retryproc;
1815 		}
1816 	} else {
1817 		ret = -1;
1818 	}
1819 	return (ret);
1820 }
1821 
1822 static void
1823 aio_kick_helper(void *context, int pending)
1824 {
1825 	struct proc *userp = context;
1826 
1827 	mtx_lock(&aio_job_mtx);
1828 	while (--pending >= 0) {
1829 		if (aio_kick(userp))
1830 			break;
1831 	}
1832 	mtx_unlock(&aio_job_mtx);
1833 }
1834 
1835 /*
1836  * Support the aio_return system call; as a side effect, kernel resources
1837  * are released.
1838  */
1839 static int
1840 kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
1841 {
1842 	struct proc *p = td->td_proc;
1843 	struct aiocblist *cb;
1844 	struct kaioinfo *ki;
1845 	int status, error;
1846 
1847 	ki = p->p_aioinfo;
1848 	if (ki == NULL)
1849 		return (EINVAL);
1850 	AIO_LOCK(ki);
1851 	TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
1852 		if (cb->uuaiocb == uaiocb)
1853 			break;
1854 	}
1855 	if (cb != NULL) {
1856 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
1857 		status = cb->uaiocb._aiocb_private.status;
1858 		error = cb->uaiocb._aiocb_private.error;
1859 		td->td_retval[0] = status;
1860 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1861 			td->td_ru.ru_oublock += cb->outputcharge;
1862 			cb->outputcharge = 0;
1863 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1864 			td->td_ru.ru_inblock += cb->inputcharge;
1865 			cb->inputcharge = 0;
1866 		}
1867 		aio_free_entry(cb);
1868 		AIO_UNLOCK(ki);
1869 		ops->store_error(uaiocb, error);
1870 		ops->store_status(uaiocb, status);
1871 	} else {
1872 		error = EINVAL;
1873 		AIO_UNLOCK(ki);
1874 	}
1875 	return (error);
1876 }
1877 
1878 int
1879 sys_aio_return(struct thread *td, struct aio_return_args *uap)
1880 {
1881 
1882 	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
1883 }
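
/*
 * An illustrative userland sketch (not part of this file): the usual
 * consumer submits a request, polls aio_error(2) until the job leaves
 * EINPROGRESS, and then reclaims the result, and the kernel resources
 * released above, with aio_return(2).  The descriptor "fd" and the
 * buffer are assumptions made only for this example.
 *
 *	struct aiocb acb;
 *	char buf[4096];
 *	ssize_t done;
 *
 *	memset(&acb, 0, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = sizeof(buf);
 *	acb.aio_offset = 0;
 *	if (aio_read(&acb) == -1)
 *		err(1, "aio_read");
 *	while (aio_error(&acb) == EINPROGRESS)
 *		usleep(1000);
 *	done = aio_return(&acb);
 */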
1884 
1885 /*
1886  * Allow a process to wake up when any of the specified I/O requests completes.
1887  */
1888 static int
1889 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
1890     struct timespec *ts)
1891 {
1892 	struct proc *p = td->td_proc;
1893 	struct timeval atv;
1894 	struct kaioinfo *ki;
1895 	struct aiocblist *cb, *cbfirst;
1896 	int error, i, timo;
1897 
1898 	timo = 0;
1899 	if (ts) {
1900 		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
1901 			return (EINVAL);
1902 
1903 		TIMESPEC_TO_TIMEVAL(&atv, ts);
1904 		if (itimerfix(&atv))
1905 			return (EINVAL);
1906 		timo = tvtohz(&atv);
1907 	}
1908 
1909 	ki = p->p_aioinfo;
1910 	if (ki == NULL)
1911 		return (EAGAIN);
1912 
1913 	if (njoblist == 0)
1914 		return (0);
1915 
1916 	AIO_LOCK(ki);
1917 	for (;;) {
1918 		cbfirst = NULL;
1919 		error = 0;
1920 		TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
1921 			for (i = 0; i < njoblist; i++) {
1922 				if (cb->uuaiocb == ujoblist[i]) {
1923 					if (cbfirst == NULL)
1924 						cbfirst = cb;
1925 					if (cb->jobstate == JOBST_JOBFINISHED)
1926 						goto RETURN;
1927 				}
1928 			}
1929 		}
1930 		/* None of the requested jobs is pending; all have finished. */
1931 		if (cbfirst == NULL)
1932 			break;
1933 
1934 		ki->kaio_flags |= KAIO_WAKEUP;
1935 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
1936 		    "aiospn", timo);
1937 		if (error == ERESTART)
1938 			error = EINTR;
1939 		if (error)
1940 			break;
1941 	}
1942 RETURN:
1943 	AIO_UNLOCK(ki);
1944 	return (error);
1945 }
1946 
1947 int
1948 sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1949 {
1950 	struct timespec ts, *tsp;
1951 	struct aiocb **ujoblist;
1952 	int error;
1953 
1954 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
1955 		return (EINVAL);
1956 
1957 	if (uap->timeout) {
1958 		/* Get timespec struct. */
1959 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1960 			return (error);
1961 		tsp = &ts;
1962 	} else
1963 		tsp = NULL;
1964 
1965 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
1966 	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
1967 	if (error == 0)
1968 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
1969 	uma_zfree(aiol_zone, ujoblist);
1970 	return (error);
1971 }
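
/*
 * An illustrative userland sketch (not part of this file): waiting for
 * any of several outstanding requests with a timeout.  "acb0" and
 * "acb1" are assumed to have been submitted already.
 *
 *	const struct aiocb *list[2] = { &acb0, &acb1 };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	if (aio_suspend(list, 2, &ts) == -1) {
 *		if (errno == EAGAIN)
 *			warnx("timed out");
 *		else if (errno == EINTR)
 *			warnx("interrupted by a signal");
 *	}
 */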
1972 
1973 /*
1974  * aio_cancel cancels any non-physio aio operations not currently in
1975  * progress.
1976  */
1977 int
1978 sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1979 {
1980 	struct proc *p = td->td_proc;
1981 	struct kaioinfo *ki;
1982 	struct aiocblist *cbe, *cbn;
1983 	struct file *fp;
1984 	struct socket *so;
1985 	int error;
1986 	int remove;
1987 	int cancelled = 0;
1988 	int notcancelled = 0;
1989 	struct vnode *vp;
1990 
1991 	/* Lookup file object. */
1992 	error = fget(td, uap->fd, 0, &fp);
1993 	if (error)
1994 		return (error);
1995 
1996 	ki = p->p_aioinfo;
1997 	if (ki == NULL)
1998 		goto done;
1999 
2000 	if (fp->f_type == DTYPE_VNODE) {
2001 		vp = fp->f_vnode;
2002 		if (vn_isdisk(vp, &error)) {
2003 			fdrop(fp, td);
2004 			td->td_retval[0] = AIO_NOTCANCELED;
2005 			return (0);
2006 		}
2007 	}
2008 
2009 	AIO_LOCK(ki);
2010 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
2011 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
2012 		    ((uap->aiocbp == NULL) ||
2013 		     (uap->aiocbp == cbe->uuaiocb))) {
2014 			remove = 0;
2015 
2016 			mtx_lock(&aio_job_mtx);
2017 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
2018 				TAILQ_REMOVE(&aio_jobs, cbe, list);
2019 				remove = 1;
2020 			} else if (cbe->jobstate == JOBST_JOBQSOCK) {
2021 				MPASS(fp->f_type == DTYPE_SOCKET);
2022 				so = fp->f_data;
2023 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
2024 				remove = 1;
2025 			} else if (cbe->jobstate == JOBST_JOBQSYNC) {
2026 				TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
2027 				remove = 1;
2028 			}
2029 			mtx_unlock(&aio_job_mtx);
2030 
2031 			if (remove) {
2032 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
2033 				cbe->uaiocb._aiocb_private.status = -1;
2034 				cbe->uaiocb._aiocb_private.error = ECANCELED;
2035 				aio_bio_done_notify(p, cbe, DONE_QUEUE);
2036 				cancelled++;
2037 			} else {
2038 				notcancelled++;
2039 			}
2040 			if (uap->aiocbp != NULL)
2041 				break;
2042 		}
2043 	}
2044 	AIO_UNLOCK(ki);
2045 
2046 done:
2047 	fdrop(fp, td);
2048 
2049 	if (uap->aiocbp != NULL) {
2050 		if (cancelled) {
2051 			td->td_retval[0] = AIO_CANCELED;
2052 			return (0);
2053 		}
2054 	}
2055 
2056 	if (notcancelled) {
2057 		td->td_retval[0] = AIO_NOTCANCELED;
2058 		return (0);
2059 	}
2060 
2061 	if (cancelled) {
2062 		td->td_retval[0] = AIO_CANCELED;
2063 		return (0);
2064 	}
2065 
2066 	td->td_retval[0] = AIO_ALLDONE;
2067 
2068 	return (0);
2069 }
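
/*
 * An illustrative userland sketch (not part of this file): cancelling a
 * single request and interpreting the return values set above.  "fd"
 * and "acb" are assumptions made only for this example.
 *
 *	int r = aio_cancel(fd, &acb);
 *
 *	if (r == AIO_CANCELED)
 *		warnx("cancelled; aio_error() now reports ECANCELED");
 *	else if (r == AIO_NOTCANCELED)
 *		warnx("still in progress, e.g. raw disk I/O");
 *	else if (r == AIO_ALLDONE)
 *		warnx("the request had already completed");
 *	else
 *		err(1, "aio_cancel");
 */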
2070 
2071 /*
2072  * aio_error is implemented at the kernel level for compatibility purposes
2073  * only.  For a user mode async implementation, it would be best to do it in
2074  * a userland subroutine.
2075  */
2076 static int
2077 kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
2078 {
2079 	struct proc *p = td->td_proc;
2080 	struct aiocblist *cb;
2081 	struct kaioinfo *ki;
2082 	int status;
2083 
2084 	ki = p->p_aioinfo;
2085 	if (ki == NULL) {
2086 		td->td_retval[0] = EINVAL;
2087 		return (0);
2088 	}
2089 
2090 	AIO_LOCK(ki);
2091 	TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
2092 		if (cb->uuaiocb == aiocbp) {
2093 			if (cb->jobstate == JOBST_JOBFINISHED)
2094 				td->td_retval[0] =
2095 					cb->uaiocb._aiocb_private.error;
2096 			else
2097 				td->td_retval[0] = EINPROGRESS;
2098 			AIO_UNLOCK(ki);
2099 			return (0);
2100 		}
2101 	}
2102 	AIO_UNLOCK(ki);
2103 
2104 	/*
2105 	 * Hack for failure of aio_aqueue(): return the error it recorded.
2106 	 */
2107 	status = ops->fetch_status(aiocbp);
2108 	if (status == -1) {
2109 		td->td_retval[0] = ops->fetch_error(aiocbp);
2110 		return (0);
2111 	}
2112 
2113 	td->td_retval[0] = EINVAL;
2114 	return (0);
2115 }
2116 
2117 int
2118 sys_aio_error(struct thread *td, struct aio_error_args *uap)
2119 {
2120 
2121 	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
2122 }
2123 
2124 /* syscall - asynchronous read from a file (REALTIME) */
2125 int
2126 sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
2127 {
2128 
2129 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2130 	    &aiocb_ops_osigevent));
2131 }
2132 
2133 int
2134 sys_aio_read(struct thread *td, struct aio_read_args *uap)
2135 {
2136 
2137 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
2138 }
2139 
2140 /* syscall - asynchronous write to a file (REALTIME) */
2141 int
2142 sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
2143 {
2144 
2145 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2146 	    &aiocb_ops_osigevent));
2147 }
2148 
2149 int
2150 sys_aio_write(struct thread *td, struct aio_write_args *uap)
2151 {
2152 
2153 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
2154 }
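
/*
 * An illustrative userland sketch (not part of this file): submitting a
 * write with signal-based completion notification instead of polling.
 * SIGUSR1, "fd", "buf", "len" and "off" are assumptions made only for
 * this example; the SA_SIGINFO handler finds the finished request in
 * info->si_value.sival_ptr and calls aio_return() on it.
 *
 *	struct aiocb acb;
 *
 *	memset(&acb, 0, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = len;
 *	acb.aio_offset = off;
 *	acb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	acb.aio_sigevent.sigev_signo = SIGUSR1;
 *	acb.aio_sigevent.sigev_value.sival_ptr = &acb;
 *	if (aio_write(&acb) == -1)
 *		err(1, "aio_write");
 */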
2155 
2156 static int
2157 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
2158     struct aiocb **acb_list, int nent, struct sigevent *sig,
2159     struct aiocb_ops *ops)
2160 {
2161 	struct proc *p = td->td_proc;
2162 	struct aiocb *iocb;
2163 	struct kaioinfo *ki;
2164 	struct aioliojob *lj;
2165 	struct kevent kev;
2166 	int error;
2167 	int nerror;
2168 	int i;
2169 
2170 	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
2171 		return (EINVAL);
2172 
2173 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2174 		return (EINVAL);
2175 
2176 	if (p->p_aioinfo == NULL)
2177 		aio_init_aioinfo(p);
2178 
2179 	ki = p->p_aioinfo;
2180 
2181 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
2182 	lj->lioj_flags = 0;
2183 	lj->lioj_count = 0;
2184 	lj->lioj_finished_count = 0;
2185 	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
2186 	ksiginfo_init(&lj->lioj_ksi);
2187 
2188 	/*
2189 	 * Set up the signal notification.
2190 	 */
2191 	if (sig && (mode == LIO_NOWAIT)) {
2192 		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
2193 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2194 			/* Assume only new style KEVENT */
2195 			kev.filter = EVFILT_LIO;
2196 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
2197 			kev.ident = (uintptr_t)uacb_list; /* something unique */
2198 			kev.data = (intptr_t)lj;
2199 			/* pass user defined sigval data */
2200 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
2201 			error = kqfd_register(
2202 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
2203 			if (error) {
2204 				uma_zfree(aiolio_zone, lj);
2205 				return (error);
2206 			}
2207 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
2208 			;
2209 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2210 			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
2211 				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2212 					uma_zfree(aiolio_zone, lj);
2213 					return (EINVAL);
2214 				}
2215 				lj->lioj_flags |= LIOJ_SIGNAL;
2216 		} else {
2217 			uma_zfree(aiolio_zone, lj);
2218 			return (EINVAL);
2219 		}
2220 	}
2221 
2222 	AIO_LOCK(ki);
2223 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2224 	/*
2225 	 * Add an extra aiocb count to prevent the lio from being freed by
2226 	 * other threads doing aio_waitcomplete or aio_return, and to keep
2227 	 * the event from being sent until we have queued all of the
2228 	 * requests.
2229 	 */
2230 	lj->lioj_count = 1;
2231 	AIO_UNLOCK(ki);
2232 
2233 	/*
2234 	 * Queue up each of the listed I/O requests.
2235 	 */
2236 	nerror = 0;
2237 	for (i = 0; i < nent; i++) {
2238 		iocb = acb_list[i];
2239 		if (iocb != NULL) {
2240 			error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
2241 			if (error != 0)
2242 				nerror++;
2243 		}
2244 	}
2245 
2246 	error = 0;
2247 	AIO_LOCK(ki);
2248 	if (mode == LIO_WAIT) {
2249 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
2250 			ki->kaio_flags |= KAIO_WAKEUP;
2251 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
2252 			    PRIBIO | PCATCH, "aiospn", 0);
2253 			if (error == ERESTART)
2254 				error = EINTR;
2255 			if (error)
2256 				break;
2257 		}
2258 	} else {
2259 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
2260 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2261 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
2262 				KNOTE_LOCKED(&lj->klist, 1);
2263 			}
2264 			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
2265 			    == LIOJ_SIGNAL
2266 			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2267 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
2268 				aio_sendsig(p, &lj->lioj_signal,
2269 					    &lj->lioj_ksi);
2270 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2271 			}
2272 		}
2273 	}
2274 	lj->lioj_count--;
2275 	if (lj->lioj_count == 0) {
2276 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
2277 		knlist_delete(&lj->klist, curthread, 1);
2278 		PROC_LOCK(p);
2279 		sigqueue_take(&lj->lioj_ksi);
2280 		PROC_UNLOCK(p);
2281 		AIO_UNLOCK(ki);
2282 		uma_zfree(aiolio_zone, lj);
2283 	} else
2284 		AIO_UNLOCK(ki);
2285 
2286 	if (nerror)
2287 		return (EIO);
2288 	return (error);
2289 }
2290 
2291 /* syscall - list directed I/O (REALTIME) */
2292 int
2293 sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
2294 {
2295 	struct aiocb **acb_list;
2296 	struct sigevent *sigp, sig;
2297 	struct osigevent osig;
2298 	int error, nent;
2299 
2300 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2301 		return (EINVAL);
2302 
2303 	nent = uap->nent;
2304 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2305 		return (EINVAL);
2306 
2307 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2308 		error = copyin(uap->sig, &osig, sizeof(osig));
2309 		if (error)
2310 			return (error);
2311 		error = convert_old_sigevent(&osig, &sig);
2312 		if (error)
2313 			return (error);
2314 		sigp = &sig;
2315 	} else
2316 		sigp = NULL;
2317 
2318 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2319 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2320 	if (error == 0)
2321 		error = kern_lio_listio(td, uap->mode,
2322 		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2323 		    &aiocb_ops_osigevent);
2324 	free(acb_list, M_LIO);
2325 	return (error);
2326 }
2327 
2328 /* syscall - list directed I/O (REALTIME) */
2329 int
2330 sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
2331 {
2332 	struct aiocb **acb_list;
2333 	struct sigevent *sigp, sig;
2334 	int error, nent;
2335 
2336 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2337 		return (EINVAL);
2338 
2339 	nent = uap->nent;
2340 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2341 		return (EINVAL);
2342 
2343 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2344 		error = copyin(uap->sig, &sig, sizeof(sig));
2345 		if (error)
2346 			return (error);
2347 		sigp = &sig;
2348 	} else
2349 		sigp = NULL;
2350 
2351 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2352 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2353 	if (error == 0)
2354 		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
2355 		    nent, sigp, &aiocb_ops);
2356 	free(acb_list, M_LIO);
2357 	return (error);
2358 }
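
/*
 * An illustrative userland sketch (not part of this file): issuing a
 * batch of requests and waiting for all of them, which exercises the
 * LIO_WAIT path above.  "rd" and "wr" are aiocbs assumed to have their
 * descriptors, buffers and offsets filled in already; an EIO error
 * means at least one request could not be queued, and results are
 * still collected per aiocb with aio_return().
 *
 *	struct aiocb *list[2] = { &rd, &wr };
 *
 *	rd.aio_lio_opcode = LIO_READ;
 *	wr.aio_lio_opcode = LIO_WRITE;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
 *		err(1, "lio_listio");
 */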
2359 
2360 /*
2361  * Called from the interrupt thread for physio; we should return as
2362  * quickly as possible, so we schedule a biohelper task.
2363  */
2364 static void
2365 aio_physwakeup(struct buf *bp)
2366 {
2367 	struct aiocblist *aiocbe;
2368 
2369 	aiocbe = (struct aiocblist *)bp->b_caller1;
2370 	taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
2371 }
2372 
2373 /*
2374  * Task routine to perform the heavy completion work, wakeups, and signals.
2375  */
2376 static void
2377 biohelper(void *context, int pending)
2378 {
2379 	struct aiocblist *aiocbe = context;
2380 	struct buf *bp;
2381 	struct proc *userp;
2382 	struct kaioinfo *ki;
2383 	int nblks;
2384 
2385 	bp = aiocbe->bp;
2386 	userp = aiocbe->userproc;
2387 	ki = userp->p_aioinfo;
2388 	AIO_LOCK(ki);
2389 	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2390 	aiocbe->uaiocb._aiocb_private.error = 0;
2391 	if (bp->b_ioflags & BIO_ERROR)
2392 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2393 	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
2394 	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
2395 		aiocbe->outputcharge += nblks;
2396 	else
2397 		aiocbe->inputcharge += nblks;
2398 	aiocbe->bp = NULL;
2399 	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
2400 	ki->kaio_buffer_count--;
2401 	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
2402 	AIO_UNLOCK(ki);
2403 
2404 	/* Release mapping into kernel space. */
2405 	vunmapbuf(bp);
2406 	relpbuf(bp, NULL);
2407 	atomic_subtract_int(&num_buf_aio, 1);
2408 }
2409 
2410 /* syscall - wait for the next completion of an aio request */
2411 static int
2412 kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
2413     struct timespec *ts, struct aiocb_ops *ops)
2414 {
2415 	struct proc *p = td->td_proc;
2416 	struct timeval atv;
2417 	struct kaioinfo *ki;
2418 	struct aiocblist *cb;
2419 	struct aiocb *uuaiocb;
2420 	int error, status, timo;
2421 
2422 	ops->store_aiocb(aiocbp, NULL);
2423 
2424 	timo = 0;
2425 	if (ts) {
2426 		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
2427 			return (EINVAL);
2428 
2429 		TIMESPEC_TO_TIMEVAL(&atv, ts);
2430 		if (itimerfix(&atv))
2431 			return (EINVAL);
2432 		timo = tvtohz(&atv);
2433 	}
2434 
2435 	if (p->p_aioinfo == NULL)
2436 		aio_init_aioinfo(p);
2437 	ki = p->p_aioinfo;
2438 
2439 	error = 0;
2440 	cb = NULL;
2441 	AIO_LOCK(ki);
2442 	while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
2443 		ki->kaio_flags |= KAIO_WAKEUP;
2444 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2445 		    "aiowc", timo);
2446 		if (timo && error == ERESTART)
2447 			error = EINTR;
2448 		if (error)
2449 			break;
2450 	}
2451 
2452 	if (cb != NULL) {
2453 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
2454 		uuaiocb = cb->uuaiocb;
2455 		status = cb->uaiocb._aiocb_private.status;
2456 		error = cb->uaiocb._aiocb_private.error;
2457 		td->td_retval[0] = status;
2458 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2459 			td->td_ru.ru_oublock += cb->outputcharge;
2460 			cb->outputcharge = 0;
2461 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2462 			td->td_ru.ru_inblock += cb->inputcharge;
2463 			cb->inputcharge = 0;
2464 		}
2465 		aio_free_entry(cb);
2466 		AIO_UNLOCK(ki);
2467 		ops->store_aiocb(aiocbp, uuaiocb);
2468 		ops->store_error(uuaiocb, error);
2469 		ops->store_status(uuaiocb, status);
2470 	} else
2471 		AIO_UNLOCK(ki);
2472 
2473 	return (error);
2474 }
2475 
2476 int
2477 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2478 {
2479 	struct timespec ts, *tsp;
2480 	int error;
2481 
2482 	if (uap->timeout) {
2483 		/* Get timespec struct. */
2484 		error = copyin(uap->timeout, &ts, sizeof(ts));
2485 		if (error)
2486 			return (error);
2487 		tsp = &ts;
2488 	} else
2489 		tsp = NULL;
2490 
2491 	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
2492 }
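
/*
 * An illustrative userland sketch (not part of this file): FreeBSD's
 * aio_waitcomplete(2) dequeues whichever request finishes next, so the
 * caller need not track individual aiocbs.  A NULL timeout blocks
 * indefinitely; "done" receives the completed aiocb and the return
 * value is its aio_return()-style result.
 *
 *	struct aiocb *done;
 *	ssize_t n;
 *
 *	n = aio_waitcomplete(&done, NULL);
 *	if (n == -1)
 *		err(1, "aio_waitcomplete");
 */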
2493 
2494 static int
2495 kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
2496     struct aiocb_ops *ops)
2497 {
2498 	struct proc *p = td->td_proc;
2499 	struct kaioinfo *ki;
2500 
2501 	if (op != O_SYNC) /* XXX lack of O_DSYNC */
2502 		return (EINVAL);
2503 	ki = p->p_aioinfo;
2504 	if (ki == NULL)
2505 		aio_init_aioinfo(p);
2506 	return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
2507 }
2508 
2509 int
2510 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
2511 {
2512 
2513 	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
2514 }
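
/*
 * An illustrative userland sketch (not part of this file): queueing an
 * asynchronous fsync behind earlier writes on the same descriptor.
 * Only O_SYNC is accepted by this implementation, as kern_aio_fsync
 * notes above; "fd" is an assumption made only for this example.
 *
 *	struct aiocb sync;
 *
 *	memset(&sync, 0, sizeof(sync));
 *	sync.aio_fildes = fd;
 *	if (aio_fsync(O_SYNC, &sync) == -1)
 *		err(1, "aio_fsync");
 *	while (aio_error(&sync) == EINPROGRESS)
 *		usleep(1000);
 */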
2515 
2516 /* kqueue attach function */
2517 static int
2518 filt_aioattach(struct knote *kn)
2519 {
2520 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2521 
2522 	/*
2523 	 * The aiocbe pointer must be validated before using it, so
2524 	 * registration is restricted to the kernel; the user cannot
2525 	 * set EV_FLAG1.
2526 	 */
2527 	if ((kn->kn_flags & EV_FLAG1) == 0)
2528 		return (EPERM);
2529 	kn->kn_ptr.p_aio = aiocbe;
2530 	kn->kn_flags &= ~EV_FLAG1;
2531 
2532 	knlist_add(&aiocbe->klist, kn, 0);
2533 
2534 	return (0);
2535 }
2536 
2537 /* kqueue detach function */
2538 static void
2539 filt_aiodetach(struct knote *kn)
2540 {
2541 	struct knlist *knl;
2542 
2543 	knl = &kn->kn_ptr.p_aio->klist;
2544 	knl->kl_lock(knl->kl_lockarg);
2545 	if (!knlist_empty(knl))
2546 		knlist_remove(knl, kn, 1);
2547 	knl->kl_unlock(knl->kl_lockarg);
2548 }
2549 
2550 /* kqueue filter function */
2551 /*ARGSUSED*/
2552 static int
2553 filt_aio(struct knote *kn, long hint)
2554 {
2555 	struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
2556 
2557 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2558 	if (aiocbe->jobstate != JOBST_JOBFINISHED)
2559 		return (0);
2560 	kn->kn_flags |= EV_EOF;
2561 	return (1);
2562 }
2563 
2564 /* kqueue attach function */
2565 static int
2566 filt_lioattach(struct knote *kn)
2567 {
2568 	struct aioliojob *lj = (struct aioliojob *)kn->kn_sdata;
2569 
2570 	/*
2571 	 * The aioliojob pointer must be validated before using it, so
2572 	 * registration is restricted to the kernel; the user cannot
2573 	 * set EV_FLAG1.
2574 	 */
2575 	if ((kn->kn_flags & EV_FLAG1) == 0)
2576 		return (EPERM);
2577 	kn->kn_ptr.p_lio = lj;
2578 	kn->kn_flags &= ~EV_FLAG1;
2579 
2580 	knlist_add(&lj->klist, kn, 0);
2581 
2582 	return (0);
2583 }
2584 
2585 /* kqueue detach function */
2586 static void
2587 filt_liodetach(struct knote *kn)
2588 {
2589 	struct knlist *knl;
2590 
2591 	knl = &kn->kn_ptr.p_lio->klist;
2592 	knl->kl_lock(knl->kl_lockarg);
2593 	if (!knlist_empty(knl))
2594 		knlist_remove(knl, kn, 1);
2595 	knl->kl_unlock(knl->kl_lockarg);
2596 }
2597 
2598 /* kqueue filter function */
2599 /*ARGSUSED*/
2600 static int
2601 filt_lio(struct knote *kn, long hint)
2602 {
2603 	struct aioliojob *lj = kn->kn_ptr.p_lio;
2604 
2605 	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
2606 }
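
/*
 * An illustrative userland sketch (not part of this file): completion
 * notification through the EVFILT_AIO filter registered by
 * filt_aioattach() above.  "kq", "fd", "buf" and "len" are assumptions
 * made only for this example; see aio(4) for the exact contents of the
 * returned kevent.
 *
 *	struct aiocb acb;
 *	struct kevent ev;
 *
 *	memset(&acb, 0, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = len;
 *	acb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	acb.aio_sigevent.sigev_notify_kqueue = kq;
 *	acb.aio_sigevent.sigev_value.sival_ptr = &acb;
 *	if (aio_read(&acb) == -1)
 *		err(1, "aio_read");
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
 *		(void)aio_return((struct aiocb *)ev.ident);
 */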
2607 
2608 #ifdef COMPAT_FREEBSD32
2609 
2610 struct __aiocb_private32 {
2611 	int32_t	status;
2612 	int32_t	error;
2613 	uint32_t kernelinfo;
2614 };
2615 
2616 typedef struct oaiocb32 {
2617 	int	aio_fildes;		/* File descriptor */
2618 	uint64_t aio_offset __packed;	/* File offset for I/O */
2619 	uint32_t aio_buf;		/* I/O buffer in process space */
2620 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2621 	struct	osigevent32 aio_sigevent; /* Signal to deliver */
2622 	int	aio_lio_opcode;		/* LIO opcode */
2623 	int	aio_reqprio;		/* Request priority -- ignored */
2624 	struct	__aiocb_private32 _aiocb_private;
2625 } oaiocb32_t;
2626 
2627 typedef struct aiocb32 {
2628 	int32_t	aio_fildes;		/* File descriptor */
2629 	uint64_t aio_offset __packed;	/* File offset for I/O */
2630 	uint32_t aio_buf;		/* I/O buffer in process space */
2631 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2632 	int	__spare__[2];
2633 	uint32_t __spare2__;
2634 	int	aio_lio_opcode;		/* LIO opcode */
2635 	int	aio_reqprio;		/* Request priority -- ignored */
2636 	struct __aiocb_private32 _aiocb_private;
2637 	struct sigevent32 aio_sigevent;	/* Signal to deliver */
2638 } aiocb32_t;
2639 
2640 static int
2641 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
2642 {
2643 
2644 	/*
2645 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
2646 	 * supported by AIO with the old sigevent structure.
2647 	 */
2648 	CP(*osig, *nsig, sigev_notify);
2649 	switch (nsig->sigev_notify) {
2650 	case SIGEV_NONE:
2651 		break;
2652 	case SIGEV_SIGNAL:
2653 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
2654 		break;
2655 	case SIGEV_KEVENT:
2656 		nsig->sigev_notify_kqueue =
2657 		    osig->__sigev_u.__sigev_notify_kqueue;
2658 		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
2659 		break;
2660 	default:
2661 		return (EINVAL);
2662 	}
2663 	return (0);
2664 }
2665 
2666 static int
2667 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
2668 {
2669 	struct oaiocb32 job32;
2670 	int error;
2671 
2672 	bzero(kjob, sizeof(struct aiocb));
2673 	error = copyin(ujob, &job32, sizeof(job32));
2674 	if (error)
2675 		return (error);
2676 
2677 	CP(job32, *kjob, aio_fildes);
2678 	CP(job32, *kjob, aio_offset);
2679 	PTRIN_CP(job32, *kjob, aio_buf);
2680 	CP(job32, *kjob, aio_nbytes);
2681 	CP(job32, *kjob, aio_lio_opcode);
2682 	CP(job32, *kjob, aio_reqprio);
2683 	CP(job32, *kjob, _aiocb_private.status);
2684 	CP(job32, *kjob, _aiocb_private.error);
2685 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2686 	return (convert_old_sigevent32(&job32.aio_sigevent,
2687 	    &kjob->aio_sigevent));
2688 }
2689 
2690 static int
2691 convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
2692 {
2693 
2694 	CP(*sig32, *sig, sigev_notify);
2695 	switch (sig->sigev_notify) {
2696 	case SIGEV_NONE:
2697 		break;
2698 	case SIGEV_THREAD_ID:
2699 		CP(*sig32, *sig, sigev_notify_thread_id);
2700 		/* FALLTHROUGH */
2701 	case SIGEV_SIGNAL:
2702 		CP(*sig32, *sig, sigev_signo);
2703 		break;
2704 	case SIGEV_KEVENT:
2705 		CP(*sig32, *sig, sigev_notify_kqueue);
2706 		CP(*sig32, *sig, sigev_notify_kevent_flags);
2707 		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
2708 		break;
2709 	default:
2710 		return (EINVAL);
2711 	}
2712 	return (0);
2713 }
2714 
2715 static int
2716 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
2717 {
2718 	struct aiocb32 job32;
2719 	int error;
2720 
2721 	error = copyin(ujob, &job32, sizeof(job32));
2722 	if (error)
2723 		return (error);
2724 	CP(job32, *kjob, aio_fildes);
2725 	CP(job32, *kjob, aio_offset);
2726 	PTRIN_CP(job32, *kjob, aio_buf);
2727 	CP(job32, *kjob, aio_nbytes);
2728 	CP(job32, *kjob, aio_lio_opcode);
2729 	CP(job32, *kjob, aio_reqprio);
2730 	CP(job32, *kjob, _aiocb_private.status);
2731 	CP(job32, *kjob, _aiocb_private.error);
2732 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2733 	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
2734 }
2735 
2736 static long
2737 aiocb32_fetch_status(struct aiocb *ujob)
2738 {
2739 	struct aiocb32 *ujob32;
2740 
2741 	ujob32 = (struct aiocb32 *)ujob;
2742 	return (fuword32(&ujob32->_aiocb_private.status));
2743 }
2744 
2745 static long
2746 aiocb32_fetch_error(struct aiocb *ujob)
2747 {
2748 	struct aiocb32 *ujob32;
2749 
2750 	ujob32 = (struct aiocb32 *)ujob;
2751 	return (fuword32(&ujob32->_aiocb_private.error));
2752 }
2753 
2754 static int
2755 aiocb32_store_status(struct aiocb *ujob, long status)
2756 {
2757 	struct aiocb32 *ujob32;
2758 
2759 	ujob32 = (struct aiocb32 *)ujob;
2760 	return (suword32(&ujob32->_aiocb_private.status, status));
2761 }
2762 
2763 static int
2764 aiocb32_store_error(struct aiocb *ujob, long error)
2765 {
2766 	struct aiocb32 *ujob32;
2767 
2768 	ujob32 = (struct aiocb32 *)ujob;
2769 	return (suword32(&ujob32->_aiocb_private.error, error));
2770 }
2771 
2772 static int
2773 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
2774 {
2775 	struct aiocb32 *ujob32;
2776 
2777 	ujob32 = (struct aiocb32 *)ujob;
2778 	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
2779 }
2780 
2781 static int
2782 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
2783 {
2784 
2785 	return (suword32(ujobp, (long)ujob));
2786 }
2787 
2788 static struct aiocb_ops aiocb32_ops = {
2789 	.copyin = aiocb32_copyin,
2790 	.fetch_status = aiocb32_fetch_status,
2791 	.fetch_error = aiocb32_fetch_error,
2792 	.store_status = aiocb32_store_status,
2793 	.store_error = aiocb32_store_error,
2794 	.store_kernelinfo = aiocb32_store_kernelinfo,
2795 	.store_aiocb = aiocb32_store_aiocb,
2796 };
2797 
2798 static struct aiocb_ops aiocb32_ops_osigevent = {
2799 	.copyin = aiocb32_copyin_old_sigevent,
2800 	.fetch_status = aiocb32_fetch_status,
2801 	.fetch_error = aiocb32_fetch_error,
2802 	.store_status = aiocb32_store_status,
2803 	.store_error = aiocb32_store_error,
2804 	.store_kernelinfo = aiocb32_store_kernelinfo,
2805 	.store_aiocb = aiocb32_store_aiocb,
2806 };
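
/*
 * The two ops vectors above let the 32-bit compat syscalls below share
 * the kern_aio_*() implementations with the native ABI: only the
 * copyin of the differently laid out aiocb32/osigevent32 structures
 * and the accesses to the 32-bit status, error, kernelinfo and aiocb
 * pointer words differ.
 */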
2807 
2808 int
2809 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
2810 {
2811 
2812 	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2813 }
2814 
2815 int
2816 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
2817 {
2818 	struct timespec32 ts32;
2819 	struct timespec ts, *tsp;
2820 	struct aiocb **ujoblist;
2821 	uint32_t *ujoblist32;
2822 	int error, i;
2823 
2824 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
2825 		return (EINVAL);
2826 
2827 	if (uap->timeout) {
2828 		/* Get timespec struct. */
2829 		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
2830 			return (error);
2831 		CP(ts32, ts, tv_sec);
2832 		CP(ts32, ts, tv_nsec);
2833 		tsp = &ts;
2834 	} else
2835 		tsp = NULL;
2836 
2837 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
2838 	ujoblist32 = (uint32_t *)ujoblist;
2839 	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
2840 	    sizeof(ujoblist32[0]));
2841 	if (error == 0) {
2842 		for (i = uap->nent - 1; i >= 0; i--)
2843 			ujoblist[i] = PTRIN(ujoblist32[i]);
2844 
2845 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2846 	}
2847 	uma_zfree(aiol_zone, ujoblist);
2848 	return (error);
2849 }
2850 
2851 int
2852 freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
2853 {
2854 
2855 	return (sys_aio_cancel(td, (struct aio_cancel_args *)uap));
2856 }
2857 
2858 int
2859 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
2860 {
2861 
2862 	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2863 }
2864 
2865 int
2866 freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
2867 {
2868 
2869 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2870 	    &aiocb32_ops_osigevent));
2871 }
2872 
2873 int
2874 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
2875 {
2876 
2877 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2878 	    &aiocb32_ops));
2879 }
2880 
2881 int
2882 freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
2883 {
2884 
2885 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2886 	    &aiocb32_ops_osigevent));
2887 }
2888 
2889 int
2890 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
2891 {
2892 
2893 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2894 	    &aiocb32_ops));
2895 }
2896 
2897 int
2898 freebsd32_aio_waitcomplete(struct thread *td,
2899     struct freebsd32_aio_waitcomplete_args *uap)
2900 {
2901 	struct timespec32 ts32;
2902 	struct timespec ts, *tsp;
2903 	int error;
2904 
2905 	if (uap->timeout) {
2906 		/* Get timespec struct. */
2907 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
2908 		if (error)
2909 			return (error);
2910 		CP(ts32, ts, tv_sec);
2911 		CP(ts32, ts, tv_nsec);
2912 		tsp = &ts;
2913 	} else
2914 		tsp = NULL;
2915 
2916 	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
2917 	    &aiocb32_ops));
2918 }
2919 
2920 int
2921 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
2922 {
2923 
2924 	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
2925 	    &aiocb32_ops));
2926 }
2927 
2928 int
2929 freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
2930 {
2931 	struct aiocb **acb_list;
2932 	struct sigevent *sigp, sig;
2933 	struct osigevent32 osig;
2934 	uint32_t *acb_list32;
2935 	int error, i, nent;
2936 
2937 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2938 		return (EINVAL);
2939 
2940 	nent = uap->nent;
2941 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2942 		return (EINVAL);
2943 
2944 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2945 		error = copyin(uap->sig, &osig, sizeof(osig));
2946 		if (error)
2947 			return (error);
2948 		error = convert_old_sigevent32(&osig, &sig);
2949 		if (error)
2950 			return (error);
2951 		sigp = &sig;
2952 	} else
2953 		sigp = NULL;
2954 
2955 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2956 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
2957 	if (error) {
2958 		free(acb_list32, M_LIO);
2959 		return (error);
2960 	}
2961 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2962 	for (i = 0; i < nent; i++)
2963 		acb_list[i] = PTRIN(acb_list32[i]);
2964 	free(acb_list32, M_LIO);
2965 
2966 	error = kern_lio_listio(td, uap->mode,
2967 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2968 	    &aiocb32_ops_osigevent);
2969 	free(acb_list, M_LIO);
2970 	return (error);
2971 }
2972 
2973 int
2974 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
2975 {
2976 	struct aiocb **acb_list;
2977 	struct sigevent *sigp, sig;
2978 	struct sigevent32 sig32;
2979 	uint32_t *acb_list32;
2980 	int error, i, nent;
2981 
2982 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2983 		return (EINVAL);
2984 
2985 	nent = uap->nent;
2986 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2987 		return (EINVAL);
2988 
2989 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2990 		error = copyin(uap->sig, &sig32, sizeof(sig32));
2991 		if (error)
2992 			return (error);
2993 		error = convert_sigevent32(&sig32, &sig);
2994 		if (error)
2995 			return (error);
2996 		sigp = &sig;
2997 	} else
2998 		sigp = NULL;
2999 
3000 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
3001 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
3002 	if (error) {
3003 		free(acb_list32, M_LIO);
3004 		return (error);
3005 	}
3006 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
3007 	for (i = 0; i < nent; i++)
3008 		acb_list[i] = PTRIN(acb_list32[i]);
3009 	free(acb_list32, M_LIO);
3010 
3011 	error = kern_lio_listio(td, uap->mode,
3012 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
3013 	    &aiocb32_ops);
3014 	free(acb_list, M_LIO);
3015 	return (error);
3016 }
3017 
3018 #endif
3019