xref: /freebsd/sys/kern/vfs_aio.c (revision 730cecb05aaf016ac52ef7cfc691ccec3a0408cd)
1 /*-
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  */
16 
17 /*
18  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
19  */
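/*
 * Illustrative userland usage sketch (not part of the kernel code, names
 * such as fd/buf are only placeholders): a request is described by a
 * struct aiocb, submitted with aio_read(2) or aio_write(2), polled with
 * aio_error(2), and reaped with aio_return(2):
 *
 *	struct aiocb cb;
 *	char buf[512];
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == 0) {
 *		while (aio_error(&cb) == EINPROGRESS)
 *			;	(or block in aio_suspend()/aio_waitcomplete())
 *		n = aio_return(&cb);	(bytes read; releases kernel state)
 *	}
 *
 * The kernel-side implementation of those system calls lives in this file.
 */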
20 
21 #include <sys/cdefs.h>
22 __FBSDID("$FreeBSD$");
23 
24 #include "opt_compat.h"
25 
26 #include <sys/param.h>
27 #include <sys/systm.h>
28 #include <sys/malloc.h>
29 #include <sys/bio.h>
30 #include <sys/buf.h>
31 #include <sys/capability.h>
32 #include <sys/eventhandler.h>
33 #include <sys/sysproto.h>
34 #include <sys/filedesc.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/kthread.h>
38 #include <sys/fcntl.h>
39 #include <sys/file.h>
40 #include <sys/limits.h>
41 #include <sys/lock.h>
42 #include <sys/mutex.h>
43 #include <sys/unistd.h>
44 #include <sys/posix4.h>
45 #include <sys/proc.h>
46 #include <sys/resourcevar.h>
47 #include <sys/signalvar.h>
48 #include <sys/protosw.h>
49 #include <sys/rwlock.h>
50 #include <sys/sema.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <sys/syscall.h>
54 #include <sys/sysent.h>
55 #include <sys/sysctl.h>
56 #include <sys/sx.h>
57 #include <sys/taskqueue.h>
58 #include <sys/vnode.h>
59 #include <sys/conf.h>
60 #include <sys/event.h>
61 #include <sys/mount.h>
62 
63 #include <machine/atomic.h>
64 
65 #include <vm/vm.h>
66 #include <vm/vm_extern.h>
67 #include <vm/pmap.h>
68 #include <vm/vm_map.h>
69 #include <vm/vm_object.h>
70 #include <vm/uma.h>
71 #include <sys/aio.h>
72 
73 #include "opt_vfs_aio.h"
74 
75 /*
76  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
77  * overflow. (XXX will be removed soon.)
78  */
79 static u_long jobrefid;
80 
81 /*
82  * Counter for aio_fsync.
83  */
84 static uint64_t jobseqno;
85 
86 #define JOBST_NULL		0
87 #define JOBST_JOBQSOCK		1
88 #define JOBST_JOBQGLOBAL	2
89 #define JOBST_JOBRUNNING	3
90 #define JOBST_JOBFINISHED	4
91 #define JOBST_JOBQBUF		5
92 #define JOBST_JOBQSYNC		6
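/*
 * Typical job state transitions (a simplified view of the code below): a
 * queued request normally goes JOBST_JOBQGLOBAL -> JOBST_JOBRUNNING ->
 * JOBST_JOBFINISHED and is reclaimed to JOBST_NULL in aio_free_entry().
 * Requests handed to the BIO backend sit in JOBST_JOBQBUF, aio_fsync()
 * requests waiting for earlier I/O sit in JOBST_JOBQSYNC, and socket
 * requests waiting for readiness sit in JOBST_JOBQSOCK.
 */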
93 
94 #ifndef MAX_AIO_PER_PROC
95 #define MAX_AIO_PER_PROC	32
96 #endif
97 
98 #ifndef MAX_AIO_QUEUE_PER_PROC
99 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
100 #endif
101 
102 #ifndef MAX_AIO_PROCS
103 #define MAX_AIO_PROCS		32
104 #endif
105 
106 #ifndef MAX_AIO_QUEUE
107 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
108 #endif
109 
110 #ifndef TARGET_AIO_PROCS
111 #define TARGET_AIO_PROCS	4
112 #endif
113 
114 #ifndef MAX_BUF_AIO
115 #define MAX_BUF_AIO		16
116 #endif
117 
118 #ifndef AIOD_TIMEOUT_DEFAULT
119 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
120 #endif
121 
122 #ifndef AIOD_LIFETIME_DEFAULT
123 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
124 #endif
125 
126 FEATURE(aio, "Asynchronous I/O");
127 
128 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
129 
130 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
131 
132 static int max_aio_procs = MAX_AIO_PROCS;
133 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
134 	CTLFLAG_RW, &max_aio_procs, 0,
135 	"Maximum number of kernel threads to use for handling async IO");
136 
137 static int num_aio_procs = 0;
138 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
139 	CTLFLAG_RD, &num_aio_procs, 0,
140 	"Number of presently active kernel threads for async IO");
141 
142 /*
143  * The code will adjust the actual number of AIO processes towards this
144  * number when it gets a chance.
145  */
146 static int target_aio_procs = TARGET_AIO_PROCS;
147 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
148 	0, "Preferred number of ready kernel threads for async IO");
149 
150 static int max_queue_count = MAX_AIO_QUEUE;
151 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
152     "Maximum number of aio requests to queue, globally");
153 
154 static int num_queue_count = 0;
155 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
156     "Number of queued aio requests");
157 
158 static int num_buf_aio = 0;
159 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
160     "Number of aio requests presently handled by the buf subsystem");
161 
162 /* Number of async I/O threads in the process of being started */
163 /* XXX This should be local to aio_aqueue() */
164 static int num_aio_resv_start = 0;
165 
166 static int aiod_timeout;
167 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
168     "Timeout value for synchronous aio operations");
169 
170 static int aiod_lifetime;
171 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
172     "Maximum lifetime for idle aiod");
173 
174 static int unloadable = 0;
175 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
176     "Allow unload of aio (not recommended)");
177 
178 
179 static int max_aio_per_proc = MAX_AIO_PER_PROC;
180 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
181     0, "Maximum active aio requests per process (stored in the process)");
182 
183 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
184 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
185     &max_aio_queue_per_proc, 0,
186     "Maximum queued aio requests per process (stored in the process)");
187 
188 static int max_buf_aio = MAX_BUF_AIO;
189 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
190     "Maximum buf aio requests per process (stored in the process)");
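/*
 * All of the knobs above live under the vfs.aio sysctl node, so an
 * administrator could, for example, grow the daemon pool at runtime with
 * something like:
 *
 *	sysctl vfs.aio.max_aio_procs=64
 *	sysctl vfs.aio.target_aio_procs=8
 *
 * The values shown are only illustrative, not recommendations.
 */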
191 
192 typedef struct oaiocb {
193 	int	aio_fildes;		/* File descriptor */
194 	off_t	aio_offset;		/* File offset for I/O */
195 	volatile void *aio_buf;         /* I/O buffer in process space */
196 	size_t	aio_nbytes;		/* Number of bytes for I/O */
197 	struct	osigevent aio_sigevent;	/* Signal to deliver */
198 	int	aio_lio_opcode;		/* LIO opcode */
199 	int	aio_reqprio;		/* Request priority -- ignored */
200 	struct	__aiocb_private	_aiocb_private;
201 } oaiocb_t;
202 
203 /*
204  * Below is a key of locks used to protect each member of struct aiocblist
205  * aioliojob and kaioinfo and any backends.
206  *
207  * * - need not be protected
208  * a - locked by kaioinfo lock
209  * b - locked by the backend lock; the backend lock can be null in some
210  *     cases (for example, for the BIO backend), in which case the proc
211  *     lock is reused.
212  * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
213  */
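/*
 * For example, kaio_count in struct kaioinfo below is tagged (a) and may
 * only be examined or modified with AIO_LOCK(ki) held, while an
 * aiocblist's jobstate is tagged (b) and is protected by whichever
 * backend currently owns the job.
 */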
214 
215 /*
216  * Currently there are only two backends: BIO and generic file I/O.
217  * Socket I/O is served by the generic file I/O backend, which is not a
218  * good idea: disk file I/O, and any other type opened without O_NONBLOCK,
219  * can block the daemon threads, and if no thread is left to serve socket
220  * I/O, the socket I/O is delayed too long or starved.  We should create
221  * threads dedicated to sockets to do non-blocking I/O, and the same for
222  * pipes and fifos; for those we really need a non-blocking interface,
223  * since fiddling with O_NONBLOCK in the file structure is not safe due to
224  * the race between userland and the aio daemons.
225  */
226 
227 struct aiocblist {
228 	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list for backend */
229 	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
230 	TAILQ_ENTRY(aiocblist) allist;  /* (a) list of all jobs in proc */
231 	int	jobflags;		/* (a) job flags */
232 	int	jobstate;		/* (b) job state */
233 	int	inputcharge;		/* (*) input blocks */
234 	int	outputcharge;		/* (*) output blocks */
235 	struct	buf *bp;		/* (*) private to BIO backend,
236 				  	 * buffer pointer
237 					 */
238 	struct	proc *userproc;		/* (*) user process */
239 	struct  ucred *cred;		/* (*) active credential when created */
240 	struct	file *fd_file;		/* (*) pointer to file structure */
241 	struct	aioliojob *lio;		/* (*) optional lio job */
242 	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
243 	struct	knlist klist;		/* (a) list of knotes */
244 	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
245 	ksiginfo_t ksi;			/* (a) realtime signal info */
246 	struct	task biotask;		/* (*) private to BIO backend */
247 	uint64_t seqno;			/* (*) job number */
248 	int	pending;		/* (a) number of pending I/O, aio_fsync only */
249 };
250 
251 /* jobflags */
252 #define AIOCBLIST_DONE		0x01
253 #define AIOCBLIST_BUFDONE	0x02
254 #define AIOCBLIST_RUNDOWN	0x04
255 #define AIOCBLIST_CHECKSYNC	0x08
256 
257 /*
258  * AIO process info
259  */
260 #define AIOP_FREE	0x1			/* proc on free queue */
261 
262 struct aiothreadlist {
263 	int aiothreadflags;			/* (c) AIO proc flags */
264 	TAILQ_ENTRY(aiothreadlist) list;	/* (c) list of processes */
265 	struct thread *aiothread;		/* (*) the AIO thread */
266 };
267 
268 /*
269  * data-structure for lio signal management
270  */
271 struct aioliojob {
272 	int	lioj_flags;			/* (a) listio flags */
273 	int	lioj_count;			/* (a) count of jobs in this lio */
274 	int	lioj_finished_count;		/* (a) count of finished jobs */
275 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
276 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
277 	struct  knlist klist;			/* (a) list of knotes */
278 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
279 };
280 
281 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
282 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
283 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
284 
285 /*
286  * per process aio data structure
287  */
288 struct kaioinfo {
289 	struct mtx	kaio_mtx;	/* the lock to protect this struct */
290 	int	kaio_flags;		/* (a) per process kaio flags */
291 	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
292 	int	kaio_active_count;	/* (c) number of currently used AIOs */
293 	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
294 	int	kaio_count;		/* (a) size of AIO queue */
295 	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
296 	int	kaio_buffer_count;	/* (a) number of physio buffers */
297 	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in the process */
298 	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
299 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
300 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
301 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue for process */
302 	TAILQ_HEAD(,aiocblist) kaio_sockqueue;  /* (a) queue for aios waiting on sockets,
303 						 *  NOT USED YET.
304 						 */
305 	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
306 	struct	task	kaio_task;	/* (*) task to kick aio threads */
307 };
308 
309 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
310 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
311 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
312 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
313 
314 #define KAIO_RUNDOWN	0x1	/* process is being run down */
315 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
316 
317 /*
318  * Operations used to interact with userland aio control blocks.
319  * Different ABIs provide their own operations.
320  */
321 struct aiocb_ops {
322 	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);
323 	long	(*fetch_status)(struct aiocb *ujob);
324 	long	(*fetch_error)(struct aiocb *ujob);
325 	int	(*store_status)(struct aiocb *ujob, long status);
326 	int	(*store_error)(struct aiocb *ujob, long error);
327 	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
328 	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
329 };
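/*
 * For instance, the native ABI instance (aiocb_ops, defined later in this
 * file) implements these hooks with copyin() plus the fuword()/suword()
 * accessors on the user's struct aiocb, while aiocb_ops_osigevent differs
 * only in its copyin routine, which converts the old osigevent layout.
 */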
330 
331 static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* (c) Idle daemons */
332 static struct sema aio_newproc_sem;
333 static struct mtx aio_job_mtx;
334 static struct mtx aio_sock_mtx;
335 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* (c) Async job list */
336 static struct unrhdr *aiod_unr;
337 
338 void		aio_init_aioinfo(struct proc *p);
339 static int	aio_onceonly(void);
340 static int	aio_free_entry(struct aiocblist *aiocbe);
341 static void	aio_process(struct aiocblist *aiocbe);
342 static int	aio_newproc(int *);
343 int		aio_aqueue(struct thread *td, struct aiocb *job,
344 			struct aioliojob *lio, int type, struct aiocb_ops *ops);
345 static void	aio_physwakeup(struct buf *bp);
346 static void	aio_proc_rundown(void *arg, struct proc *p);
347 static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
348 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
349 static void	biohelper(void *, int);
350 static void	aio_daemon(void *param);
351 static void	aio_swake_cb(struct socket *, struct sockbuf *);
352 static int	aio_unload(void);
353 static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
354 #define DONE_BUF	1
355 #define DONE_QUEUE	2
356 static int	aio_kick(struct proc *userp);
357 static void	aio_kick_nowait(struct proc *userp);
358 static void	aio_kick_helper(void *context, int pending);
359 static int	filt_aioattach(struct knote *kn);
360 static void	filt_aiodetach(struct knote *kn);
361 static int	filt_aio(struct knote *kn, long hint);
362 static int	filt_lioattach(struct knote *kn);
363 static void	filt_liodetach(struct knote *kn);
364 static int	filt_lio(struct knote *kn, long hint);
365 
366 /*
367  * Zones for:
368  * 	kaio	Per process async io info
369  *	aiop	async io thread data
370  *	aiocb	async io jobs
371  *	aiol	list io job pointer - internal to aio_suspend XXX
372  *	aiolio	list io jobs
373  */
374 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
375 
376 /* kqueue filters for aio */
377 static struct filterops aio_filtops = {
378 	.f_isfd = 0,
379 	.f_attach = filt_aioattach,
380 	.f_detach = filt_aiodetach,
381 	.f_event = filt_aio,
382 };
383 static struct filterops lio_filtops = {
384 	.f_isfd = 0,
385 	.f_attach = filt_lioattach,
386 	.f_detach = filt_liodetach,
387 	.f_event = filt_lio
388 };
389 
390 static eventhandler_tag exit_tag, exec_tag;
391 
392 TASKQUEUE_DEFINE_THREAD(aiod_bio);
393 
394 /*
395  * Main operations function for use as a kernel module.
396  */
397 static int
398 aio_modload(struct module *module, int cmd, void *arg)
399 {
400 	int error = 0;
401 
402 	switch (cmd) {
403 	case MOD_LOAD:
404 		aio_onceonly();
405 		break;
406 	case MOD_UNLOAD:
407 		error = aio_unload();
408 		break;
409 	case MOD_SHUTDOWN:
410 		break;
411 	default:
412 		error = EINVAL;
413 		break;
414 	}
415 	return (error);
416 }
417 
418 static moduledata_t aio_mod = {
419 	"aio",
420 	&aio_modload,
421 	NULL
422 };
423 
424 static struct syscall_helper_data aio_syscalls[] = {
425 	SYSCALL_INIT_HELPER(aio_cancel),
426 	SYSCALL_INIT_HELPER(aio_error),
427 	SYSCALL_INIT_HELPER(aio_fsync),
428 	SYSCALL_INIT_HELPER(aio_read),
429 	SYSCALL_INIT_HELPER(aio_return),
430 	SYSCALL_INIT_HELPER(aio_suspend),
431 	SYSCALL_INIT_HELPER(aio_waitcomplete),
432 	SYSCALL_INIT_HELPER(aio_write),
433 	SYSCALL_INIT_HELPER(lio_listio),
434 	SYSCALL_INIT_HELPER(oaio_read),
435 	SYSCALL_INIT_HELPER(oaio_write),
436 	SYSCALL_INIT_HELPER(olio_listio),
437 	SYSCALL_INIT_LAST
438 };
439 
440 #ifdef COMPAT_FREEBSD32
441 #include <sys/mount.h>
442 #include <sys/socket.h>
443 #include <compat/freebsd32/freebsd32.h>
444 #include <compat/freebsd32/freebsd32_proto.h>
445 #include <compat/freebsd32/freebsd32_signal.h>
446 #include <compat/freebsd32/freebsd32_syscall.h>
447 #include <compat/freebsd32/freebsd32_util.h>
448 
449 static struct syscall_helper_data aio32_syscalls[] = {
450 	SYSCALL32_INIT_HELPER(freebsd32_aio_return),
451 	SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
452 	SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
453 	SYSCALL32_INIT_HELPER(freebsd32_aio_error),
454 	SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
455 	SYSCALL32_INIT_HELPER(freebsd32_aio_read),
456 	SYSCALL32_INIT_HELPER(freebsd32_aio_write),
457 	SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
458 	SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
459 	SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
460 	SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
461 	SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
462 	SYSCALL_INIT_LAST
463 };
464 #endif
465 
466 DECLARE_MODULE(aio, aio_mod,
467 	SI_SUB_VFS, SI_ORDER_ANY);
468 MODULE_VERSION(aio, 1);
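/*
 * The facility can either be compiled statically into the kernel (the
 * VFS_AIO option, see opt_vfs_aio.h above) or, as declared above, loaded
 * at runtime, e.g. with "kldload aio"; unloading is refused unless the
 * vfs.aio.unloadable tunable is set.
 */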
469 
470 /*
471  * Startup initialization
472  */
473 static int
474 aio_onceonly(void)
475 {
476 	int error;
477 
478 	/* XXX: should probably just use so->callback */
479 	aio_swake = &aio_swake_cb;
480 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
481 	    EVENTHANDLER_PRI_ANY);
482 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
483 	    EVENTHANDLER_PRI_ANY);
484 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
485 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
486 	TAILQ_INIT(&aio_freeproc);
487 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
488 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
489 	mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
490 	TAILQ_INIT(&aio_jobs);
491 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
492 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
493 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
494 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
495 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
496 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
497 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
498 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
499 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
500 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
501 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
502 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
503 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
504 	jobrefid = 1;
505 	async_io_version = _POSIX_VERSION;
506 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
507 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
508 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
509 
510 	error = syscall_helper_register(aio_syscalls);
511 	if (error)
512 		return (error);
513 #ifdef COMPAT_FREEBSD32
514 	error = syscall32_helper_register(aio32_syscalls);
515 	if (error)
516 		return (error);
517 #endif
518 	return (0);
519 }
520 
521 /*
522  * Callback for unload of AIO when used as a module.
523  */
524 static int
525 aio_unload(void)
526 {
527 	int error;
528 
529 	/*
530 	 * XXX: no unloads by default, it's too dangerous.
531 	 * Perhaps we could do it if we locked out callers and then
532 	 * did an aio_proc_rundown() on each process.
533 	 *
534 	 * jhb: aio_proc_rundown() needs to run on curproc though,
535 	 * so I don't think that would fly.
536 	 */
537 	if (!unloadable)
538 		return (EOPNOTSUPP);
539 
540 #ifdef COMPAT_FREEBSD32
541 	syscall32_helper_unregister(aio32_syscalls);
542 #endif
543 	syscall_helper_unregister(aio_syscalls);
544 
545 	error = kqueue_del_filteropts(EVFILT_AIO);
546 	if (error)
547 		return error;
548 	error = kqueue_del_filteropts(EVFILT_LIO);
549 	if (error)
550 		return error;
551 	async_io_version = 0;
552 	aio_swake = NULL;
553 	taskqueue_free(taskqueue_aiod_bio);
554 	delete_unrhdr(aiod_unr);
555 	uma_zdestroy(kaio_zone);
556 	uma_zdestroy(aiop_zone);
557 	uma_zdestroy(aiocb_zone);
558 	uma_zdestroy(aiol_zone);
559 	uma_zdestroy(aiolio_zone);
560 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
561 	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
562 	mtx_destroy(&aio_job_mtx);
563 	mtx_destroy(&aio_sock_mtx);
564 	sema_destroy(&aio_newproc_sem);
565 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
566 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
567 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
568 	return (0);
569 }
570 
571 /*
572  * Init the per-process aioinfo structure.  The aioinfo limits are set
573  * per-process for user limit (resource) management.
574  */
575 void
576 aio_init_aioinfo(struct proc *p)
577 {
578 	struct kaioinfo *ki;
579 
580 	ki = uma_zalloc(kaio_zone, M_WAITOK);
581 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
582 	ki->kaio_flags = 0;
583 	ki->kaio_maxactive_count = max_aio_per_proc;
584 	ki->kaio_active_count = 0;
585 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
586 	ki->kaio_count = 0;
587 	ki->kaio_ballowed_count = max_buf_aio;
588 	ki->kaio_buffer_count = 0;
589 	TAILQ_INIT(&ki->kaio_all);
590 	TAILQ_INIT(&ki->kaio_done);
591 	TAILQ_INIT(&ki->kaio_jobqueue);
592 	TAILQ_INIT(&ki->kaio_bufqueue);
593 	TAILQ_INIT(&ki->kaio_liojoblist);
594 	TAILQ_INIT(&ki->kaio_sockqueue);
595 	TAILQ_INIT(&ki->kaio_syncqueue);
596 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
597 	PROC_LOCK(p);
598 	if (p->p_aioinfo == NULL) {
599 		p->p_aioinfo = ki;
600 		PROC_UNLOCK(p);
601 	} else {
602 		PROC_UNLOCK(p);
603 		mtx_destroy(&ki->kaio_mtx);
604 		uma_zfree(kaio_zone, ki);
605 	}
606 
607 	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
608 		aio_newproc(NULL);
609 }
610 
611 static int
612 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
613 {
614 	struct thread *td;
615 	int error;
616 
617 	error = sigev_findtd(p, sigev, &td);
618 	if (error)
619 		return (error);
620 	if (!KSI_ONQ(ksi)) {
621 		ksiginfo_set_sigev(ksi, sigev);
622 		ksi->ksi_code = SI_ASYNCIO;
623 		ksi->ksi_flags |= KSI_EXT | KSI_INS;
624 		tdsendsignal(p, td, ksi->ksi_signo, ksi);
625 	}
626 	PROC_UNLOCK(p);
627 	return (error);
628 }
629 
630 /*
631  * Free a job entry.  Wait for completion if it is currently active, but don't
632  * delay forever.  If we delay, we return a flag that says that we have to
633  * restart the queue scan.
634  */
635 static int
636 aio_free_entry(struct aiocblist *aiocbe)
637 {
638 	struct kaioinfo *ki;
639 	struct aioliojob *lj;
640 	struct proc *p;
641 
642 	p = aiocbe->userproc;
643 	MPASS(curproc == p);
644 	ki = p->p_aioinfo;
645 	MPASS(ki != NULL);
646 
647 	AIO_LOCK_ASSERT(ki, MA_OWNED);
648 	MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
649 
650 	atomic_subtract_int(&num_queue_count, 1);
651 
652 	ki->kaio_count--;
653 	MPASS(ki->kaio_count >= 0);
654 
655 	TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
656 	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
657 
658 	lj = aiocbe->lio;
659 	if (lj) {
660 		lj->lioj_count--;
661 		lj->lioj_finished_count--;
662 
663 		if (lj->lioj_count == 0) {
664 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
665 			/* lio is going away, we need to destroy any knotes */
666 			knlist_delete(&lj->klist, curthread, 1);
667 			PROC_LOCK(p);
668 			sigqueue_take(&lj->lioj_ksi);
669 			PROC_UNLOCK(p);
670 			uma_zfree(aiolio_zone, lj);
671 		}
672 	}
673 
674 	/* aiocbe is going away, we need to destroy any knotes */
675 	knlist_delete(&aiocbe->klist, curthread, 1);
676 	PROC_LOCK(p);
677 	sigqueue_take(&aiocbe->ksi);
678 	PROC_UNLOCK(p);
679 
680 	MPASS(aiocbe->bp == NULL);
681 	aiocbe->jobstate = JOBST_NULL;
682 	AIO_UNLOCK(ki);
683 
684 	/*
685 	 * The thread argument here is used to find the owning process
686 	 * and is also passed to fo_close() which may pass it to various
687 	 * places such as devsw close() routines.  Because of that, we
688 	 * need a thread pointer from the process owning the job that is
689 	 * persistent and won't disappear out from under us or move to
690 	 * another process.
691 	 *
692 	 * Currently, all the callers of this function call it to remove
693 	 * an aiocblist from the current process' job list either via a
694 	 * syscall or due to the current process calling exit() or
695 	 * execve().  Thus, we know that p == curproc.  We also know that
696 	 * curthread can't exit since we are curthread.
697 	 *
698 	 * Therefore, we use curthread as the thread to pass to
699 	 * knlist_delete().  This does mean that it is possible for the
700 	 * thread pointer at close time to differ from the thread pointer
701 	 * at open time, but this is already true of file descriptors in
702 	 * a multithreaded process.
703 	 */
704 	fdrop(aiocbe->fd_file, curthread);
705 	crfree(aiocbe->cred);
706 	uma_zfree(aiocb_zone, aiocbe);
707 	AIO_LOCK(ki);
708 
709 	return (0);
710 }
711 
712 static void
713 aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
714 {
715 	aio_proc_rundown(arg, p);
716 }
717 
718 /*
719  * Rundown the jobs for a given process.
720  */
721 static void
722 aio_proc_rundown(void *arg, struct proc *p)
723 {
724 	struct kaioinfo *ki;
725 	struct aioliojob *lj;
726 	struct aiocblist *cbe, *cbn;
727 	struct file *fp;
728 	struct socket *so;
729 	int remove;
730 
731 	KASSERT(curthread->td_proc == p,
732 	    ("%s: called on non-curproc", __func__));
733 	ki = p->p_aioinfo;
734 	if (ki == NULL)
735 		return;
736 
737 	AIO_LOCK(ki);
738 	ki->kaio_flags |= KAIO_RUNDOWN;
739 
740 restart:
741 
742 	/*
743 	 * Try to cancel all pending requests. This code simulates
744 	 * aio_cancel on all pending I/O requests.
745 	 */
746 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
747 		remove = 0;
748 		mtx_lock(&aio_job_mtx);
749 		if (cbe->jobstate == JOBST_JOBQGLOBAL) {
750 			TAILQ_REMOVE(&aio_jobs, cbe, list);
751 			remove = 1;
752 		} else if (cbe->jobstate == JOBST_JOBQSOCK) {
753 			fp = cbe->fd_file;
754 			MPASS(fp->f_type == DTYPE_SOCKET);
755 			so = fp->f_data;
756 			TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
757 			remove = 1;
758 		} else if (cbe->jobstate == JOBST_JOBQSYNC) {
759 			TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
760 			remove = 1;
761 		}
762 		mtx_unlock(&aio_job_mtx);
763 
764 		if (remove) {
765 			cbe->jobstate = JOBST_JOBFINISHED;
766 			cbe->uaiocb._aiocb_private.status = -1;
767 			cbe->uaiocb._aiocb_private.error = ECANCELED;
768 			TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
769 			aio_bio_done_notify(p, cbe, DONE_QUEUE);
770 		}
771 	}
772 
773 	/* Wait for all running I/O to be finished */
774 	if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
775 	    TAILQ_FIRST(&ki->kaio_jobqueue)) {
776 		ki->kaio_flags |= KAIO_WAKEUP;
777 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
778 		goto restart;
779 	}
780 
781 	/* Free all completed I/O requests. */
782 	while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
783 		aio_free_entry(cbe);
784 
785 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
786 		if (lj->lioj_count == 0) {
787 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
788 			knlist_delete(&lj->klist, curthread, 1);
789 			PROC_LOCK(p);
790 			sigqueue_take(&lj->lioj_ksi);
791 			PROC_UNLOCK(p);
792 			uma_zfree(aiolio_zone, lj);
793 		} else {
794 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
795 			    lj->lioj_count, lj->lioj_finished_count);
796 		}
797 	}
798 	AIO_UNLOCK(ki);
799 	taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
800 	mtx_destroy(&ki->kaio_mtx);
801 	uma_zfree(kaio_zone, ki);
802 	p->p_aioinfo = NULL;
803 }
804 
805 /*
806  * Select a job to run (called by an AIO daemon).
807  */
808 static struct aiocblist *
809 aio_selectjob(struct aiothreadlist *aiop)
810 {
811 	struct aiocblist *aiocbe;
812 	struct kaioinfo *ki;
813 	struct proc *userp;
814 
815 	mtx_assert(&aio_job_mtx, MA_OWNED);
816 	TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
817 		userp = aiocbe->userproc;
818 		ki = userp->p_aioinfo;
819 
820 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
821 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
822 			/* Account for currently active jobs. */
823 			ki->kaio_active_count++;
824 			aiocbe->jobstate = JOBST_JOBRUNNING;
825 			break;
826 		}
827 	}
828 	return (aiocbe);
829 }
830 
831 /*
832  *  Move all data to a permanent storage device; this code
833  *  simulates the fsync syscall.
834  */
835 static int
836 aio_fsync_vnode(struct thread *td, struct vnode *vp)
837 {
838 	struct mount *mp;
839 	int error;
840 
841 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
842 		goto drop;
843 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
844 	if (vp->v_object != NULL) {
845 		VM_OBJECT_WLOCK(vp->v_object);
846 		vm_object_page_clean(vp->v_object, 0, 0, 0);
847 		VM_OBJECT_WUNLOCK(vp->v_object);
848 	}
849 	error = VOP_FSYNC(vp, MNT_WAIT, td);
850 
851 	VOP_UNLOCK(vp, 0);
852 	vn_finished_write(mp);
853 drop:
854 	return (error);
855 }
856 
857 /*
858  * The AIO processing activity.  This is the code that does the I/O request for
859  * the non-physio version of the operations.  The normal vn operations are used,
860  * and this code should work in all instances for every type of file, including
861  * pipes, sockets, fifos, and regular files.
862  *
863  * XXX I don't think it works well for sockets, pipes, and fifos.
864  */
865 static void
866 aio_process(struct aiocblist *aiocbe)
867 {
868 	struct ucred *td_savedcred;
869 	struct thread *td;
870 	struct aiocb *cb;
871 	struct file *fp;
872 	struct socket *so;
873 	struct uio auio;
874 	struct iovec aiov;
875 	int cnt;
876 	int error;
877 	int oublock_st, oublock_end;
878 	int inblock_st, inblock_end;
879 
880 	td = curthread;
881 	td_savedcred = td->td_ucred;
882 	td->td_ucred = aiocbe->cred;
883 	cb = &aiocbe->uaiocb;
884 	fp = aiocbe->fd_file;
885 
886 	if (cb->aio_lio_opcode == LIO_SYNC) {
887 		error = 0;
888 		cnt = 0;
889 		if (fp->f_vnode != NULL)
890 			error = aio_fsync_vnode(td, fp->f_vnode);
891 		cb->_aiocb_private.error = error;
892 		cb->_aiocb_private.status = 0;
893 		td->td_ucred = td_savedcred;
894 		return;
895 	}
896 
897 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
898 	aiov.iov_len = cb->aio_nbytes;
899 
900 	auio.uio_iov = &aiov;
901 	auio.uio_iovcnt = 1;
902 	auio.uio_offset = cb->aio_offset;
903 	auio.uio_resid = cb->aio_nbytes;
904 	cnt = cb->aio_nbytes;
905 	auio.uio_segflg = UIO_USERSPACE;
906 	auio.uio_td = td;
907 
908 	inblock_st = td->td_ru.ru_inblock;
909 	oublock_st = td->td_ru.ru_oublock;
910 	/*
911 	 * aio_aqueue() acquires a reference to the file that is
912 	 * released in aio_free_entry().
913 	 */
914 	if (cb->aio_lio_opcode == LIO_READ) {
915 		auio.uio_rw = UIO_READ;
916 		if (auio.uio_resid == 0)
917 			error = 0;
918 		else
919 			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
920 	} else {
921 		if (fp->f_type == DTYPE_VNODE)
922 			bwillwrite();
923 		auio.uio_rw = UIO_WRITE;
924 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
925 	}
926 	inblock_end = td->td_ru.ru_inblock;
927 	oublock_end = td->td_ru.ru_oublock;
928 
929 	aiocbe->inputcharge = inblock_end - inblock_st;
930 	aiocbe->outputcharge = oublock_end - oublock_st;
931 
932 	if ((error) && (auio.uio_resid != cnt)) {
933 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
934 			error = 0;
935 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
936 			int sigpipe = 1;
937 			if (fp->f_type == DTYPE_SOCKET) {
938 				so = fp->f_data;
939 				if (so->so_options & SO_NOSIGPIPE)
940 					sigpipe = 0;
941 			}
942 			if (sigpipe) {
943 				PROC_LOCK(aiocbe->userproc);
944 				kern_psignal(aiocbe->userproc, SIGPIPE);
945 				PROC_UNLOCK(aiocbe->userproc);
946 			}
947 		}
948 	}
949 
950 	cnt -= auio.uio_resid;
951 	cb->_aiocb_private.error = error;
952 	cb->_aiocb_private.status = cnt;
953 	td->td_ucred = td_savedcred;
954 }
955 
956 static void
957 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
958 {
959 	struct aioliojob *lj;
960 	struct kaioinfo *ki;
961 	struct aiocblist *scb, *scbn;
962 	int lj_done;
963 
964 	ki = userp->p_aioinfo;
965 	AIO_LOCK_ASSERT(ki, MA_OWNED);
966 	lj = aiocbe->lio;
967 	lj_done = 0;
968 	if (lj) {
969 		lj->lioj_finished_count++;
970 		if (lj->lioj_count == lj->lioj_finished_count)
971 			lj_done = 1;
972 	}
973 	if (type == DONE_QUEUE) {
974 		aiocbe->jobflags |= AIOCBLIST_DONE;
975 	} else {
976 		aiocbe->jobflags |= AIOCBLIST_BUFDONE;
977 	}
978 	TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
979 	aiocbe->jobstate = JOBST_JOBFINISHED;
980 
981 	if (ki->kaio_flags & KAIO_RUNDOWN)
982 		goto notification_done;
983 
984 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
985 	    aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
986 		aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
987 
988 	KNOTE_LOCKED(&aiocbe->klist, 1);
989 
990 	if (lj_done) {
991 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
992 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
993 			KNOTE_LOCKED(&lj->klist, 1);
994 		}
995 		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
996 		    == LIOJ_SIGNAL
997 		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
998 		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
999 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
1000 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1001 		}
1002 	}
1003 
1004 notification_done:
1005 	if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
1006 		TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
1007 			if (aiocbe->fd_file == scb->fd_file &&
1008 			    aiocbe->seqno < scb->seqno) {
1009 				if (--scb->pending == 0) {
1010 					mtx_lock(&aio_job_mtx);
1011 					scb->jobstate = JOBST_JOBQGLOBAL;
1012 					TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
1013 					TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
1014 					aio_kick_nowait(userp);
1015 					mtx_unlock(&aio_job_mtx);
1016 				}
1017 			}
1018 		}
1019 	}
1020 	if (ki->kaio_flags & KAIO_WAKEUP) {
1021 		ki->kaio_flags &= ~KAIO_WAKEUP;
1022 		wakeup(&userp->p_aioinfo);
1023 	}
1024 }
1025 
1026 /*
1027  * The AIO daemon.  Most of the actual work is done in aio_process(),
1028  * but the setup (and address space management) is done in this routine.
1029  */
1030 static void
1031 aio_daemon(void *_id)
1032 {
1033 	struct aiocblist *aiocbe;
1034 	struct aiothreadlist *aiop;
1035 	struct kaioinfo *ki;
1036 	struct proc *curcp, *mycp, *userp;
1037 	struct vmspace *myvm, *tmpvm;
1038 	struct thread *td = curthread;
1039 	int id = (intptr_t)_id;
1040 
1041 	/*
1042 	 * Local copies of curproc (mycp) and vmspace (myvm)
1043 	 */
1044 	mycp = td->td_proc;
1045 	myvm = mycp->p_vmspace;
1046 
1047 	KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
1048 
1049 	/*
1050 	 * Allocate and ready the aio control info.  There is one aiop structure
1051 	 * per daemon.
1052 	 */
1053 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
1054 	aiop->aiothread = td;
1055 	aiop->aiothreadflags = 0;
1056 
1057 	/* The daemon resides in its own pgrp. */
1058 	sys_setsid(td, NULL);
1059 
1060 	/*
1061 	 * Wake up parent process.  (Parent sleeps to keep from blasting away
1062 	 * and creating too many daemons.)
1063 	 */
1064 	sema_post(&aio_newproc_sem);
1065 
1066 	mtx_lock(&aio_job_mtx);
1067 	for (;;) {
1068 		/*
1069 		 * curcp is the current daemon process context.
1070 		 * userp is the current user process context.
1071 		 */
1072 		curcp = mycp;
1073 
1074 		/*
1075 		 * Take daemon off of free queue
1076 		 */
1077 		if (aiop->aiothreadflags & AIOP_FREE) {
1078 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1079 			aiop->aiothreadflags &= ~AIOP_FREE;
1080 		}
1081 
1082 		/*
1083 		 * Check for jobs.
1084 		 */
1085 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
1086 			mtx_unlock(&aio_job_mtx);
1087 			userp = aiocbe->userproc;
1088 
1089 			/*
1090 			 * Connect to process address space for user program.
1091 			 */
1092 			if (userp != curcp) {
1093 				/*
1094 				 * Save the current address space that we are
1095 				 * connected to.
1096 				 */
1097 				tmpvm = mycp->p_vmspace;
1098 
1099 				/*
1100 				 * Point to the new user address space, and
1101 				 * refer to it.
1102 				 */
1103 				mycp->p_vmspace = userp->p_vmspace;
1104 				atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
1105 
1106 				/* Activate the new mapping. */
1107 				pmap_activate(FIRST_THREAD_IN_PROC(mycp));
1108 
1109 				/*
1110 				 * If the old address space wasn't the daemon's
1111 				 * own address space, then we need to remove the
1112 				 * daemon's reference from the other process
1113 				 * that it was acting on behalf of.
1114 				 */
1115 				if (tmpvm != myvm) {
1116 					vmspace_free(tmpvm);
1117 				}
1118 				curcp = userp;
1119 			}
1120 
1121 			ki = userp->p_aioinfo;
1122 
1123 			/* Do the I/O function. */
1124 			aio_process(aiocbe);
1125 
1126 			mtx_lock(&aio_job_mtx);
1127 			/* Decrement the active job count. */
1128 			ki->kaio_active_count--;
1129 			mtx_unlock(&aio_job_mtx);
1130 
1131 			AIO_LOCK(ki);
1132 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
1133 			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
1134 			AIO_UNLOCK(ki);
1135 
1136 			mtx_lock(&aio_job_mtx);
1137 		}
1138 
1139 		/*
1140 		 * Disconnect from user address space.
1141 		 */
1142 		if (curcp != mycp) {
1143 
1144 			mtx_unlock(&aio_job_mtx);
1145 
1146 			/* Get the user address space to disconnect from. */
1147 			tmpvm = mycp->p_vmspace;
1148 
1149 			/* Get original address space for daemon. */
1150 			mycp->p_vmspace = myvm;
1151 
1152 			/* Activate the daemon's address space. */
1153 			pmap_activate(FIRST_THREAD_IN_PROC(mycp));
1154 #ifdef DIAGNOSTIC
1155 			if (tmpvm == myvm) {
1156 				printf("AIOD: vmspace problem -- %d\n",
1157 				    mycp->p_pid);
1158 			}
1159 #endif
1160 			/* Remove our vmspace reference. */
1161 			vmspace_free(tmpvm);
1162 
1163 			curcp = mycp;
1164 
1165 			mtx_lock(&aio_job_mtx);
1166 			/*
1167 			 * We have to restart to avoid a race; we only sleep
1168 			 * if no job can be selected, which implies
1169 			 * curcp == mycp.
1170 			 */
1171 			continue;
1172 		}
1173 
1174 		mtx_assert(&aio_job_mtx, MA_OWNED);
1175 
1176 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
1177 		aiop->aiothreadflags |= AIOP_FREE;
1178 
1179 		/*
1180 		 * If daemon is inactive for a long time, allow it to exit,
1181 		 * thereby freeing resources.
1182 		 */
1183 		if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
1184 		    aiod_lifetime)) {
1185 			if (TAILQ_EMPTY(&aio_jobs)) {
1186 				if ((aiop->aiothreadflags & AIOP_FREE) &&
1187 				    (num_aio_procs > target_aio_procs)) {
1188 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
1189 					num_aio_procs--;
1190 					mtx_unlock(&aio_job_mtx);
1191 					uma_zfree(aiop_zone, aiop);
1192 					free_unr(aiod_unr, id);
1193 #ifdef DIAGNOSTIC
1194 					if (mycp->p_vmspace->vm_refcnt <= 1) {
1195 						printf("AIOD: bad vm refcnt for"
1196 						    " exiting daemon: %d\n",
1197 						    mycp->p_vmspace->vm_refcnt);
1198 					}
1199 #endif
1200 					kproc_exit(0);
1201 				}
1202 			}
1203 		}
1204 	}
1205 	mtx_unlock(&aio_job_mtx);
1206 	panic("shouldn't be here\n");
1207 }
1208 
1209 /*
1210  * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
1211  * AIO daemon modifies its environment itself.
1212  */
1213 static int
1214 aio_newproc(int *start)
1215 {
1216 	int error;
1217 	struct proc *p;
1218 	int id;
1219 
1220 	id = alloc_unr(aiod_unr);
1221 	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
1222 		RFNOWAIT, 0, "aiod%d", id);
1223 	if (error == 0) {
1224 		/*
1225 		 * Wait until daemon is started.
1226 		 */
1227 		sema_wait(&aio_newproc_sem);
1228 		mtx_lock(&aio_job_mtx);
1229 		num_aio_procs++;
1230 		if (start != NULL)
1231 			(*start)--;
1232 		mtx_unlock(&aio_job_mtx);
1233 	} else {
1234 		free_unr(aiod_unr, id);
1235 	}
1236 	return (error);
1237 }
1238 
1239 /*
1240  * Try the high-performance, low-overhead physio method for eligible
1241  * VCHR devices.  This method doesn't use an aio helper thread, and
1242  * thus has very low overhead.
1243  *
1244  * Assumes that the caller, aio_aqueue(), has incremented the file
1245  * structure's reference count, preventing its deallocation for the
1246  * duration of this call.
1247  */
1248 static int
1249 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1250 {
1251 	struct aiocb *cb;
1252 	struct file *fp;
1253 	struct buf *bp;
1254 	struct vnode *vp;
1255 	struct kaioinfo *ki;
1256 	struct aioliojob *lj;
1257 	int error;
1258 
1259 	cb = &aiocbe->uaiocb;
1260 	fp = aiocbe->fd_file;
1261 
1262 	if (fp->f_type != DTYPE_VNODE)
1263 		return (-1);
1264 
1265 	vp = fp->f_vnode;
1266 
1267 	/*
1268 	 * If it's not a disk, we don't want to return a positive error;
1269 	 * that would keep the aio code from falling through to try the
1270 	 * thread-based path when you're talking to a regular file.
1271 	 */
1272 	if (!vn_isdisk(vp, &error)) {
1273 		if (error == ENOTBLK)
1274 			return (-1);
1275 		else
1276 			return (error);
1277 	}
1278 
1279 	if (vp->v_bufobj.bo_bsize == 0)
1280 		return (-1);
1281 
1282  	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
1283 		return (-1);
1284 
1285 	if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
1286 		return (-1);
1287 
1288 	if (cb->aio_nbytes >
1289 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1290 		return (-1);
1291 
1292 	ki = p->p_aioinfo;
1293 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1294 		return (-1);
1295 
1296 	/* Create and build a buffer header for a transfer. */
1297 	bp = (struct buf *)getpbuf(NULL);
1298 	BUF_KERNPROC(bp);
1299 
1300 	AIO_LOCK(ki);
1301 	ki->kaio_count++;
1302 	ki->kaio_buffer_count++;
1303 	lj = aiocbe->lio;
1304 	if (lj)
1305 		lj->lioj_count++;
1306 	AIO_UNLOCK(ki);
1307 
1308 	/*
1309 	 * Get a copy of the kva from the physical buffer.
1310 	 */
1311 	error = 0;
1312 
1313 	bp->b_bcount = cb->aio_nbytes;
1314 	bp->b_bufsize = cb->aio_nbytes;
1315 	bp->b_iodone = aio_physwakeup;
1316 	bp->b_saveaddr = bp->b_data;
1317 	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1318 	bp->b_offset = cb->aio_offset;
1319 	bp->b_iooffset = cb->aio_offset;
1320 	bp->b_blkno = btodb(cb->aio_offset);
1321 	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
1322 
1323 	/*
1324 	 * Bring buffer into kernel space.
1325 	 */
1326 	if (vmapbuf(bp, 1) < 0) {
1327 		error = EFAULT;
1328 		goto doerror;
1329 	}
1330 
1331 	AIO_LOCK(ki);
1332 	aiocbe->bp = bp;
1333 	bp->b_caller1 = (void *)aiocbe;
1334 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1335 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1336 	aiocbe->jobstate = JOBST_JOBQBUF;
1337 	cb->_aiocb_private.status = cb->aio_nbytes;
1338 	AIO_UNLOCK(ki);
1339 
1340 	atomic_add_int(&num_queue_count, 1);
1341 	atomic_add_int(&num_buf_aio, 1);
1342 
1343 	bp->b_error = 0;
1344 
1345 	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
1346 
1347 	/* Perform transfer. */
1348 	dev_strategy(vp->v_rdev, bp);
1349 	return (0);
1350 
1351 doerror:
1352 	AIO_LOCK(ki);
1353 	ki->kaio_count--;
1354 	ki->kaio_buffer_count--;
1355 	if (lj)
1356 		lj->lioj_count--;
1357 	aiocbe->bp = NULL;
1358 	AIO_UNLOCK(ki);
1359 	relpbuf(bp, NULL);
1360 	return (error);
1361 }
1362 
1363 /*
1364  * Wake up aio requests that may be serviceable now.
1365  */
1366 static void
1367 aio_swake_cb(struct socket *so, struct sockbuf *sb)
1368 {
1369 	struct aiocblist *cb, *cbn;
1370 	int opcode;
1371 
1372 	SOCKBUF_LOCK_ASSERT(sb);
1373 	if (sb == &so->so_snd)
1374 		opcode = LIO_WRITE;
1375 	else
1376 		opcode = LIO_READ;
1377 
1378 	sb->sb_flags &= ~SB_AIO;
1379 	mtx_lock(&aio_job_mtx);
1380 	TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
1381 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1382 			if (cb->jobstate != JOBST_JOBQSOCK)
1383 				panic("invalid queue value");
1384 			/* XXX
1385 			 * We don't have an actual socket backend yet,
1386 			 * so we simply move the requests to the generic
1387 			 * file I/O backend.
1388 			 */
1389 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1390 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1391 			aio_kick_nowait(cb->userproc);
1392 		}
1393 	}
1394 	mtx_unlock(&aio_job_mtx);
1395 }
1396 
1397 static int
1398 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
1399 {
1400 
1401 	/*
1402 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
1403 	 * supported by AIO with the old sigevent structure.
1404 	 */
1405 	nsig->sigev_notify = osig->sigev_notify;
1406 	switch (nsig->sigev_notify) {
1407 	case SIGEV_NONE:
1408 		break;
1409 	case SIGEV_SIGNAL:
1410 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
1411 		break;
1412 	case SIGEV_KEVENT:
1413 		nsig->sigev_notify_kqueue =
1414 		    osig->__sigev_u.__sigev_notify_kqueue;
1415 		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
1416 		break;
1417 	default:
1418 		return (EINVAL);
1419 	}
1420 	return (0);
1421 }
1422 
1423 static int
1424 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
1425 {
1426 	struct oaiocb *ojob;
1427 	int error;
1428 
1429 	bzero(kjob, sizeof(struct aiocb));
1430 	error = copyin(ujob, kjob, sizeof(struct oaiocb));
1431 	if (error)
1432 		return (error);
1433 	ojob = (struct oaiocb *)kjob;
1434 	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
1435 }
1436 
1437 static int
1438 aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
1439 {
1440 
1441 	return (copyin(ujob, kjob, sizeof(struct aiocb)));
1442 }
1443 
1444 static long
1445 aiocb_fetch_status(struct aiocb *ujob)
1446 {
1447 
1448 	return (fuword(&ujob->_aiocb_private.status));
1449 }
1450 
1451 static long
1452 aiocb_fetch_error(struct aiocb *ujob)
1453 {
1454 
1455 	return (fuword(&ujob->_aiocb_private.error));
1456 }
1457 
1458 static int
1459 aiocb_store_status(struct aiocb *ujob, long status)
1460 {
1461 
1462 	return (suword(&ujob->_aiocb_private.status, status));
1463 }
1464 
1465 static int
1466 aiocb_store_error(struct aiocb *ujob, long error)
1467 {
1468 
1469 	return (suword(&ujob->_aiocb_private.error, error));
1470 }
1471 
1472 static int
1473 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
1474 {
1475 
1476 	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
1477 }
1478 
1479 static int
1480 aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
1481 {
1482 
1483 	return (suword(ujobp, (long)ujob));
1484 }
1485 
1486 static struct aiocb_ops aiocb_ops = {
1487 	.copyin = aiocb_copyin,
1488 	.fetch_status = aiocb_fetch_status,
1489 	.fetch_error = aiocb_fetch_error,
1490 	.store_status = aiocb_store_status,
1491 	.store_error = aiocb_store_error,
1492 	.store_kernelinfo = aiocb_store_kernelinfo,
1493 	.store_aiocb = aiocb_store_aiocb,
1494 };
1495 
1496 static struct aiocb_ops aiocb_ops_osigevent = {
1497 	.copyin = aiocb_copyin_old_sigevent,
1498 	.fetch_status = aiocb_fetch_status,
1499 	.fetch_error = aiocb_fetch_error,
1500 	.store_status = aiocb_store_status,
1501 	.store_error = aiocb_store_error,
1502 	.store_kernelinfo = aiocb_store_kernelinfo,
1503 	.store_aiocb = aiocb_store_aiocb,
1504 };
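/*
 * The syscall entry points pick one of these tables; for example,
 * sys_aio_return() below passes aiocb_ops, while the osigevent table is
 * intended for the old oaio_*()/olio_listio() entry points registered in
 * aio_syscalls[].
 */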
1505 
1506 /*
1507  * Queue a new AIO request.  The choice between the threaded and the direct
1508  * physio (VCHR) technique is made in this code.
1509  */
1510 int
1511 aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
1512 	int type, struct aiocb_ops *ops)
1513 {
1514 	struct proc *p = td->td_proc;
1515 	struct file *fp;
1516 	struct socket *so;
1517 	struct aiocblist *aiocbe, *cb;
1518 	struct kaioinfo *ki;
1519 	struct kevent kev;
1520 	struct sockbuf *sb;
1521 	int opcode;
1522 	int error;
1523 	int fd, kqfd;
1524 	int jid;
1525 	u_short evflags;
1526 
1527 	if (p->p_aioinfo == NULL)
1528 		aio_init_aioinfo(p);
1529 
1530 	ki = p->p_aioinfo;
1531 
1532 	ops->store_status(job, -1);
1533 	ops->store_error(job, 0);
1534 	ops->store_kernelinfo(job, -1);
1535 
1536 	if (num_queue_count >= max_queue_count ||
1537 	    ki->kaio_count >= ki->kaio_qallowed_count) {
1538 		ops->store_error(job, EAGAIN);
1539 		return (EAGAIN);
1540 	}
1541 
1542 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
1543 	aiocbe->inputcharge = 0;
1544 	aiocbe->outputcharge = 0;
1545 	knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
1546 
1547 	error = ops->copyin(job, &aiocbe->uaiocb);
1548 	if (error) {
1549 		ops->store_error(job, error);
1550 		uma_zfree(aiocb_zone, aiocbe);
1551 		return (error);
1552 	}
1553 
1554 	/* XXX: aio_nbytes is later cast to signed types. */
1555 	if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
1556 		uma_zfree(aiocb_zone, aiocbe);
1557 		return (EINVAL);
1558 	}
1559 
1560 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
1561 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
1562 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
1563 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
1564 		ops->store_error(job, EINVAL);
1565 		uma_zfree(aiocb_zone, aiocbe);
1566 		return (EINVAL);
1567 	}
1568 
1569 	if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1570 	     aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
1571 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1572 		uma_zfree(aiocb_zone, aiocbe);
1573 		return (EINVAL);
1574 	}
1575 
1576 	ksiginfo_init(&aiocbe->ksi);
1577 
1578 	/* Save userspace address of the job info. */
1579 	aiocbe->uuaiocb = job;
1580 
1581 	/* Get the opcode. */
1582 	if (type != LIO_NOP)
1583 		aiocbe->uaiocb.aio_lio_opcode = type;
1584 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1585 
1586 	/*
1587 	 * Validate the opcode and fetch the file object for the specified
1588 	 * file descriptor.
1589 	 *
1590 	 * XXXRW: Moved the opcode validation up here so that we don't
1591 	 * retrieve a file descriptor without knowing what the capability
1592 	 * should be.
1593 	 */
1594 	fd = aiocbe->uaiocb.aio_fildes;
1595 	switch (opcode) {
1596 	case LIO_WRITE:
1597 		error = fget_write(td, fd, CAP_PWRITE, &fp);
1598 		break;
1599 	case LIO_READ:
1600 		error = fget_read(td, fd, CAP_PREAD, &fp);
1601 		break;
1602 	case LIO_SYNC:
1603 		error = fget(td, fd, CAP_FSYNC, &fp);
1604 		break;
1605 	case LIO_NOP:
1606 		error = fget(td, fd, CAP_NONE, &fp);
1607 		break;
1608 	default:
1609 		error = EINVAL;
1610 	}
1611 	if (error) {
1612 		uma_zfree(aiocb_zone, aiocbe);
1613 		ops->store_error(job, error);
1614 		return (error);
1615 	}
1616 
1617 	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
1618 		error = EINVAL;
1619 		goto aqueue_fail;
1620 	}
1621 
1622 	if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
1623 		error = EINVAL;
1624 		goto aqueue_fail;
1625 	}
1626 
1627 	aiocbe->fd_file = fp;
1628 
1629 	mtx_lock(&aio_job_mtx);
1630 	jid = jobrefid++;
1631 	aiocbe->seqno = jobseqno++;
1632 	mtx_unlock(&aio_job_mtx);
1633 	error = ops->store_kernelinfo(job, jid);
1634 	if (error) {
1635 		error = EINVAL;
1636 		goto aqueue_fail;
1637 	}
1638 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
1639 
1640 	if (opcode == LIO_NOP) {
1641 		fdrop(fp, td);
1642 		uma_zfree(aiocb_zone, aiocbe);
1643 		return (0);
1644 	}
1645 
1646 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
1647 		goto no_kqueue;
1648 	evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
1649 	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
1650 		error = EINVAL;
1651 		goto aqueue_fail;
1652 	}
1653 	kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1654 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
1655 	kev.filter = EVFILT_AIO;
1656 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
1657 	kev.data = (intptr_t)aiocbe;
1658 	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
1659 	error = kqfd_register(kqfd, &kev, td, 1);
1660 aqueue_fail:
1661 	if (error) {
1662 		fdrop(fp, td);
1663 		uma_zfree(aiocb_zone, aiocbe);
1664 		ops->store_error(job, error);
1665 		goto done;
1666 	}
1667 no_kqueue:
1668 
1669 	ops->store_error(job, EINPROGRESS);
1670 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1671 	aiocbe->userproc = p;
1672 	aiocbe->cred = crhold(td->td_ucred);
1673 	aiocbe->jobflags = 0;
1674 	aiocbe->lio = lj;
1675 
1676 	if (opcode == LIO_SYNC)
1677 		goto queueit;
1678 
1679 	if (fp->f_type == DTYPE_SOCKET) {
1680 		/*
1681 		 * Alternate queueing for socket ops: Reach down into the
1682 		 * descriptor to get the socket data.  Then check to see if the
1683 		 * socket is ready to be read or written (based on the requested
1684 		 * operation).
1685 		 *
1686 		 * If it is not ready for I/O, then queue the aiocbe on the
1687 		 * socket, and set the flags so we get a call when sbnotify()
1688 		 * happens.
1689 		 *
1690 		 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock
1691 		 * and unlock the snd sockbuf for no reason.
1692 		 */
1693 		so = fp->f_data;
1694 		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
1695 		SOCKBUF_LOCK(sb);
1696 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1697 		    LIO_WRITE) && (!sowriteable(so)))) {
1698 			sb->sb_flags |= SB_AIO;
1699 
1700 			mtx_lock(&aio_job_mtx);
1701 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1702 			mtx_unlock(&aio_job_mtx);
1703 
1704 			AIO_LOCK(ki);
1705 			TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1706 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1707 			aiocbe->jobstate = JOBST_JOBQSOCK;
1708 			ki->kaio_count++;
1709 			if (lj)
1710 				lj->lioj_count++;
1711 			AIO_UNLOCK(ki);
1712 			SOCKBUF_UNLOCK(sb);
1713 			atomic_add_int(&num_queue_count, 1);
1714 			error = 0;
1715 			goto done;
1716 		}
1717 		SOCKBUF_UNLOCK(sb);
1718 	}
1719 
1720 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1721 		goto done;
1722 #if 0
1723 	if (error > 0) {
1724 		aiocbe->uaiocb._aiocb_private.error = error;
1725 		ops->store_error(job, error);
1726 		goto done;
1727 	}
1728 #endif
1729 queueit:
1730 	/* No buffer for daemon I/O. */
1731 	aiocbe->bp = NULL;
1732 	atomic_add_int(&num_queue_count, 1);
1733 
1734 	AIO_LOCK(ki);
1735 	ki->kaio_count++;
1736 	if (lj)
1737 		lj->lioj_count++;
1738 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1739 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1740 	if (opcode == LIO_SYNC) {
1741 		TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
1742 			if (cb->fd_file == aiocbe->fd_file &&
1743 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1744 			    cb->seqno < aiocbe->seqno) {
1745 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1746 				aiocbe->pending++;
1747 			}
1748 		}
1749 		TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
1750 			if (cb->fd_file == aiocbe->fd_file &&
1751 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1752 			    cb->seqno < aiocbe->seqno) {
1753 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1754 				aiocbe->pending++;
1755 			}
1756 		}
1757 		if (aiocbe->pending != 0) {
1758 			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
1759 			aiocbe->jobstate = JOBST_JOBQSYNC;
1760 			AIO_UNLOCK(ki);
1761 			goto done;
1762 		}
1763 	}
1764 	mtx_lock(&aio_job_mtx);
1765 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1766 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1767 	aio_kick_nowait(p);
1768 	mtx_unlock(&aio_job_mtx);
1769 	AIO_UNLOCK(ki);
1770 	error = 0;
1771 done:
1772 	return (error);
1773 }
1774 
1775 static void
1776 aio_kick_nowait(struct proc *userp)
1777 {
1778 	struct kaioinfo *ki = userp->p_aioinfo;
1779 	struct aiothreadlist *aiop;
1780 
1781 	mtx_assert(&aio_job_mtx, MA_OWNED);
1782 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1783 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1784 		aiop->aiothreadflags &= ~AIOP_FREE;
1785 		wakeup(aiop->aiothread);
1786 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1787 	    ((ki->kaio_active_count + num_aio_resv_start) <
1788 	    ki->kaio_maxactive_count)) {
1789 		taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
1790 	}
1791 }
1792 
1793 static int
1794 aio_kick(struct proc *userp)
1795 {
1796 	struct kaioinfo *ki = userp->p_aioinfo;
1797 	struct aiothreadlist *aiop;
1798 	int error, ret = 0;
1799 
1800 	mtx_assert(&aio_job_mtx, MA_OWNED);
1801 retryproc:
1802 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1803 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1804 		aiop->aiothreadflags &= ~AIOP_FREE;
1805 		wakeup(aiop->aiothread);
1806 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1807 	    ((ki->kaio_active_count + num_aio_resv_start) <
1808 	    ki->kaio_maxactive_count)) {
1809 		num_aio_resv_start++;
1810 		mtx_unlock(&aio_job_mtx);
1811 		error = aio_newproc(&num_aio_resv_start);
1812 		mtx_lock(&aio_job_mtx);
1813 		if (error) {
1814 			num_aio_resv_start--;
1815 			goto retryproc;
1816 		}
1817 	} else {
1818 		ret = -1;
1819 	}
1820 	return (ret);
1821 }
1822 
1823 static void
1824 aio_kick_helper(void *context, int pending)
1825 {
1826 	struct proc *userp = context;
1827 
1828 	mtx_lock(&aio_job_mtx);
1829 	while (--pending >= 0) {
1830 		if (aio_kick(userp))
1831 			break;
1832 	}
1833 	mtx_unlock(&aio_job_mtx);
1834 }
1835 
1836 /*
1837  * Support the aio_return system call.  As a side effect, kernel resources
1838  * are released.
1839  */
1840 static int
1841 kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
1842 {
1843 	struct proc *p = td->td_proc;
1844 	struct aiocblist *cb;
1845 	struct kaioinfo *ki;
1846 	int status, error;
1847 
1848 	ki = p->p_aioinfo;
1849 	if (ki == NULL)
1850 		return (EINVAL);
1851 	AIO_LOCK(ki);
1852 	TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
1853 		if (cb->uuaiocb == uaiocb)
1854 			break;
1855 	}
1856 	if (cb != NULL) {
1857 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
1858 		status = cb->uaiocb._aiocb_private.status;
1859 		error = cb->uaiocb._aiocb_private.error;
1860 		td->td_retval[0] = status;
1861 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1862 			td->td_ru.ru_oublock += cb->outputcharge;
1863 			cb->outputcharge = 0;
1864 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1865 			td->td_ru.ru_inblock += cb->inputcharge;
1866 			cb->inputcharge = 0;
1867 		}
1868 		aio_free_entry(cb);
1869 		AIO_UNLOCK(ki);
1870 		ops->store_error(uaiocb, error);
1871 		ops->store_status(uaiocb, status);
1872 	} else {
1873 		error = EINVAL;
1874 		AIO_UNLOCK(ki);
1875 	}
1876 	return (error);
1877 }
1878 
1879 int
1880 sys_aio_return(struct thread *td, struct aio_return_args *uap)
1881 {
1882 
1883 	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
1884 }
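
/*
 * Illustrative userland sketch (not part of this kernel file): once
 * aio_error(2) stops returning EINPROGRESS, aio_return(2) is called exactly
 * once to fetch the transfer count and let the kernel release the job, as
 * kern_aio_return() above does for entries on the kaio_done queue.  The
 * helper name and its argument are hypothetical.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	ssize_t
 *	reap_one(struct aiocb *acb)
 *	{
 *		while (aio_error(acb) == EINPROGRESS)
 *			;			// busy-wait for brevity only
 *		return (aio_return(acb));	// releases kernel resources
 *	}
 */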
1885 
1886 /*
1887  * Allow a process to wake up when any of the I/O requests are completed.
1888  */
1889 static int
1890 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
1891     struct timespec *ts)
1892 {
1893 	struct proc *p = td->td_proc;
1894 	struct timeval atv;
1895 	struct kaioinfo *ki;
1896 	struct aiocblist *cb, *cbfirst;
1897 	int error, i, timo;
1898 
1899 	timo = 0;
1900 	if (ts) {
1901 		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
1902 			return (EINVAL);
1903 
1904 		TIMESPEC_TO_TIMEVAL(&atv, ts);
1905 		if (itimerfix(&atv))
1906 			return (EINVAL);
1907 		timo = tvtohz(&atv);
1908 	}
1909 
1910 	ki = p->p_aioinfo;
1911 	if (ki == NULL)
1912 		return (EAGAIN);
1913 
1914 	if (njoblist == 0)
1915 		return (0);
1916 
1917 	AIO_LOCK(ki);
1918 	for (;;) {
1919 		cbfirst = NULL;
1920 		error = 0;
1921 		TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
1922 			for (i = 0; i < njoblist; i++) {
1923 				if (cb->uuaiocb == ujoblist[i]) {
1924 					if (cbfirst == NULL)
1925 						cbfirst = cb;
1926 					if (cb->jobstate == JOBST_JOBFINISHED)
1927 						goto RETURN;
1928 				}
1929 			}
1930 		}
1931 		/* All tasks were finished. */
1932 		if (cbfirst == NULL)
1933 			break;
1934 
1935 		ki->kaio_flags |= KAIO_WAKEUP;
1936 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
1937 		    "aiospn", timo);
1938 		if (error == ERESTART)
1939 			error = EINTR;
1940 		if (error)
1941 			break;
1942 	}
1943 RETURN:
1944 	AIO_UNLOCK(ki);
1945 	return (error);
1946 }
1947 
1948 int
1949 sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1950 {
1951 	struct timespec ts, *tsp;
1952 	struct aiocb **ujoblist;
1953 	int error;
1954 
1955 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
1956 		return (EINVAL);
1957 
1958 	if (uap->timeout) {
1959 		/* Get timespec struct. */
1960 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1961 			return (error);
1962 		tsp = &ts;
1963 	} else
1964 		tsp = NULL;
1965 
1966 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
1967 	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
1968 	if (error == 0)
1969 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
1970 	uma_zfree(aiol_zone, ujoblist);
1971 	return (error);
1972 }
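
/*
 * Illustrative userland sketch (not part of this kernel file): waiting, with
 * a timeout, for any of several outstanding requests, which kern_aio_suspend()
 * above implements by sleeping on the process's kaioinfo.  The array and the
 * one-second timeout are hypothetical.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <time.h>
 *
 *	// acbs[] holds pointers to previously queued aiocbs; NULL slots are ok
 *	int
 *	wait_any(const struct aiocb *const acbs[], int n)
 *	{
 *		struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *		if (aio_suspend(acbs, n, &ts) == -1)
 *			return (errno);	// EAGAIN on timeout, EINTR on signal
 *		return (0);		// at least one request has completed
 *	}
 */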
1973 
1974 /*
1975  * aio_cancel cancels any non-physio aio operations not currently in
1976  * progress.
1977  */
1978 int
1979 sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1980 {
1981 	struct proc *p = td->td_proc;
1982 	struct kaioinfo *ki;
1983 	struct aiocblist *cbe, *cbn;
1984 	struct file *fp;
1985 	struct socket *so;
1986 	int error;
1987 	int remove;
1988 	int cancelled = 0;
1989 	int notcancelled = 0;
1990 	struct vnode *vp;
1991 
1992 	/* Lookup file object. */
1993 	error = fget(td, uap->fd, 0, &fp);
1994 	if (error)
1995 		return (error);
1996 
1997 	ki = p->p_aioinfo;
1998 	if (ki == NULL)
1999 		goto done;
2000 
2001 	if (fp->f_type == DTYPE_VNODE) {
2002 		vp = fp->f_vnode;
2003 		if (vn_isdisk(vp, &error)) {
2004 			fdrop(fp, td);
2005 			td->td_retval[0] = AIO_NOTCANCELED;
2006 			return (0);
2007 		}
2008 	}
2009 
2010 	AIO_LOCK(ki);
2011 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
2012 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
2013 		    ((uap->aiocbp == NULL) ||
2014 		     (uap->aiocbp == cbe->uuaiocb))) {
2015 			remove = 0;
2016 
2017 			mtx_lock(&aio_job_mtx);
2018 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
2019 				TAILQ_REMOVE(&aio_jobs, cbe, list);
2020 				remove = 1;
2021 			} else if (cbe->jobstate == JOBST_JOBQSOCK) {
2022 				MPASS(fp->f_type == DTYPE_SOCKET);
2023 				so = fp->f_data;
2024 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
2025 				remove = 1;
2026 			} else if (cbe->jobstate == JOBST_JOBQSYNC) {
2027 				TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
2028 				remove = 1;
2029 			}
2030 			mtx_unlock(&aio_job_mtx);
2031 
2032 			if (remove) {
2033 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
2034 				cbe->uaiocb._aiocb_private.status = -1;
2035 				cbe->uaiocb._aiocb_private.error = ECANCELED;
2036 				aio_bio_done_notify(p, cbe, DONE_QUEUE);
2037 				cancelled++;
2038 			} else {
2039 				notcancelled++;
2040 			}
2041 			if (uap->aiocbp != NULL)
2042 				break;
2043 		}
2044 	}
2045 	AIO_UNLOCK(ki);
2046 
2047 done:
2048 	fdrop(fp, td);
2049 
2050 	if (uap->aiocbp != NULL) {
2051 		if (cancelled) {
2052 			td->td_retval[0] = AIO_CANCELED;
2053 			return (0);
2054 		}
2055 	}
2056 
2057 	if (notcancelled) {
2058 		td->td_retval[0] = AIO_NOTCANCELED;
2059 		return (0);
2060 	}
2061 
2062 	if (cancelled) {
2063 		td->td_retval[0] = AIO_CANCELED;
2064 		return (0);
2065 	}
2066 
2067 	td->td_retval[0] = AIO_ALLDONE;
2068 
2069 	return (0);
2070 }
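
/*
 * Illustrative userland sketch (not part of this kernel file): cancelling
 * whatever is still queued on a descriptor and interpreting the three return
 * values set above.  fd is assumed to refer to a regular file or socket;
 * disk-backed (physio) jobs are simply reported as AIO_NOTCANCELED.
 *
 *	#include <aio.h>
 *
 *	void
 *	cancel_all(int fd)
 *	{
 *		switch (aio_cancel(fd, NULL)) {
 *		case AIO_CANCELED:	// every queued job was cancelled
 *			break;
 *		case AIO_NOTCANCELED:	// at least one job is already running
 *			break;
 *		case AIO_ALLDONE:	// nothing was left to cancel
 *			break;
 *		}
 *	}
 */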
2071 
2072 /*
2073  * aio_error is implemented in the kernel for compatibility purposes only.
2074  * only.  For a user mode async implementation, it would be best to do it in
2075  * a userland subroutine.
2076  */
2077 static int
2078 kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
2079 {
2080 	struct proc *p = td->td_proc;
2081 	struct aiocblist *cb;
2082 	struct kaioinfo *ki;
2083 	int status;
2084 
2085 	ki = p->p_aioinfo;
2086 	if (ki == NULL) {
2087 		td->td_retval[0] = EINVAL;
2088 		return (0);
2089 	}
2090 
2091 	AIO_LOCK(ki);
2092 	TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
2093 		if (cb->uuaiocb == aiocbp) {
2094 			if (cb->jobstate == JOBST_JOBFINISHED)
2095 				td->td_retval[0] =
2096 					cb->uaiocb._aiocb_private.error;
2097 			else
2098 				td->td_retval[0] = EINPROGRESS;
2099 			AIO_UNLOCK(ki);
2100 			return (0);
2101 		}
2102 	}
2103 	AIO_UNLOCK(ki);
2104 
2105 	/*
2106 	 * Hack for failure of aio_aqueue.
2107 	 */
2108 	status = ops->fetch_status(aiocbp);
2109 	if (status == -1) {
2110 		td->td_retval[0] = ops->fetch_error(aiocbp);
2111 		return (0);
2112 	}
2113 
2114 	td->td_retval[0] = EINVAL;
2115 	return (0);
2116 }
2117 
2118 int
2119 sys_aio_error(struct thread *td, struct aio_error_args *uap)
2120 {
2121 
2122 	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
2123 }
2124 
2125 /* syscall - asynchronous read from a file (REALTIME) */
2126 int
2127 sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
2128 {
2129 
2130 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2131 	    &aiocb_ops_osigevent));
2132 }
2133 
2134 int
2135 sys_aio_read(struct thread *td, struct aio_read_args *uap)
2136 {
2137 
2138 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
2139 }
2140 
2141 /* syscall - asynchronous write to a file (REALTIME) */
2142 int
2143 sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
2144 {
2145 
2146 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2147 	    &aiocb_ops_osigevent));
2148 }
2149 
2150 int
2151 sys_aio_write(struct thread *td, struct aio_write_args *uap)
2152 {
2153 
2154 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
2155 }
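
/*
 * Illustrative userland sketch (not part of this kernel file): queueing an
 * asynchronous write, which reaches aio_aqueue() with LIO_WRITE exactly as
 * sys_aio_write() above does.  The helper and its fd/buf/len/off arguments
 * are hypothetical.
 *
 *	#include <sys/types.h>
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	int
 *	queue_write(int fd, void *buf, size_t len, off_t off, struct aiocb *acb)
 *	{
 *		memset(acb, 0, sizeof(*acb));
 *		acb->aio_fildes = fd;
 *		acb->aio_buf = buf;
 *		acb->aio_nbytes = len;
 *		acb->aio_offset = off;
 *		return (aio_write(acb));	// 0 if queued, -1/errno otherwise
 *	}
 */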
2156 
2157 static int
2158 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
2159     struct aiocb **acb_list, int nent, struct sigevent *sig,
2160     struct aiocb_ops *ops)
2161 {
2162 	struct proc *p = td->td_proc;
2163 	struct aiocb *iocb;
2164 	struct kaioinfo *ki;
2165 	struct aioliojob *lj;
2166 	struct kevent kev;
2167 	int error;
2168 	int nerror;
2169 	int i;
2170 
2171 	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
2172 		return (EINVAL);
2173 
2174 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2175 		return (EINVAL);
2176 
2177 	if (p->p_aioinfo == NULL)
2178 		aio_init_aioinfo(p);
2179 
2180 	ki = p->p_aioinfo;
2181 
2182 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
2183 	lj->lioj_flags = 0;
2184 	lj->lioj_count = 0;
2185 	lj->lioj_finished_count = 0;
2186 	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
2187 	ksiginfo_init(&lj->lioj_ksi);
2188 
2189 	/*
2190 	 * Setup signal.
2191 	 */
2192 	if (sig && (mode == LIO_NOWAIT)) {
2193 		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
2194 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2195 			/* Assume only new style KEVENT */
2196 			kev.filter = EVFILT_LIO;
2197 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
2198 			kev.ident = (uintptr_t)uacb_list; /* something unique */
2199 			kev.data = (intptr_t)lj;
2200 			/* pass user defined sigval data */
2201 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
2202 			error = kqfd_register(
2203 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
2204 			if (error) {
2205 				uma_zfree(aiolio_zone, lj);
2206 				return (error);
2207 			}
2208 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
2209 			;
2210 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2211 			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
2212 				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2213 					uma_zfree(aiolio_zone, lj);
2214 					return (EINVAL);
2215 				}
2216 				lj->lioj_flags |= LIOJ_SIGNAL;
2217 		} else {
2218 			uma_zfree(aiolio_zone, lj);
2219 			return (EINVAL);
2220 		}
2221 	}
2222 
2223 	AIO_LOCK(ki);
2224 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2225 	/*
2226 	 * Add an extra aiocb count to prevent the lio from being freed
2227 	 * by other threads doing aio_waitcomplete or aio_return, and to
2228 	 * keep the event from being sent until we have queued all the
2229 	 * tasks.
2230 	 */
2231 	lj->lioj_count = 1;
2232 	AIO_UNLOCK(ki);
2233 
2234 	/*
2235 	 * Get pointers to the list of I/O requests.
2236 	 */
2237 	nerror = 0;
2238 	for (i = 0; i < nent; i++) {
2239 		iocb = acb_list[i];
2240 		if (iocb != NULL) {
2241 			error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
2242 			if (error != 0)
2243 				nerror++;
2244 		}
2245 	}
2246 
2247 	error = 0;
2248 	AIO_LOCK(ki);
2249 	if (mode == LIO_WAIT) {
2250 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
2251 			ki->kaio_flags |= KAIO_WAKEUP;
2252 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
2253 			    PRIBIO | PCATCH, "aiospn", 0);
2254 			if (error == ERESTART)
2255 				error = EINTR;
2256 			if (error)
2257 				break;
2258 		}
2259 	} else {
2260 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
2261 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2262 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
2263 				KNOTE_LOCKED(&lj->klist, 1);
2264 			}
2265 			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
2266 			    == LIOJ_SIGNAL
2267 			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2268 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
2269 				aio_sendsig(p, &lj->lioj_signal,
2270 					    &lj->lioj_ksi);
2271 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2272 			}
2273 		}
2274 	}
2275 	lj->lioj_count--;
2276 	if (lj->lioj_count == 0) {
2277 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
2278 		knlist_delete(&lj->klist, curthread, 1);
2279 		PROC_LOCK(p);
2280 		sigqueue_take(&lj->lioj_ksi);
2281 		PROC_UNLOCK(p);
2282 		AIO_UNLOCK(ki);
2283 		uma_zfree(aiolio_zone, lj);
2284 	} else
2285 		AIO_UNLOCK(ki);
2286 
2287 	if (nerror)
2288 		return (EIO);
2289 	return (error);
2290 }
2291 
2292 /* syscall - list directed I/O (REALTIME) */
2293 int
2294 sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
2295 {
2296 	struct aiocb **acb_list;
2297 	struct sigevent *sigp, sig;
2298 	struct osigevent osig;
2299 	int error, nent;
2300 
2301 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2302 		return (EINVAL);
2303 
2304 	nent = uap->nent;
2305 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2306 		return (EINVAL);
2307 
2308 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2309 		error = copyin(uap->sig, &osig, sizeof(osig));
2310 		if (error)
2311 			return (error);
2312 		error = convert_old_sigevent(&osig, &sig);
2313 		if (error)
2314 			return (error);
2315 		sigp = &sig;
2316 	} else
2317 		sigp = NULL;
2318 
2319 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2320 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2321 	if (error == 0)
2322 		error = kern_lio_listio(td, uap->mode,
2323 		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2324 		    &aiocb_ops_osigevent);
2325 	free(acb_list, M_LIO);
2326 	return (error);
2327 }
2328 
2329 /* syscall - list directed I/O (REALTIME) */
2330 int
2331 sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
2332 {
2333 	struct aiocb **acb_list;
2334 	struct sigevent *sigp, sig;
2335 	int error, nent;
2336 
2337 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2338 		return (EINVAL);
2339 
2340 	nent = uap->nent;
2341 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2342 		return (EINVAL);
2343 
2344 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2345 		error = copyin(uap->sig, &sig, sizeof(sig));
2346 		if (error)
2347 			return (error);
2348 		sigp = &sig;
2349 	} else
2350 		sigp = NULL;
2351 
2352 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2353 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2354 	if (error == 0)
2355 		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
2356 		    nent, sigp, &aiocb_ops);
2357 	free(acb_list, M_LIO);
2358 	return (error);
2359 }
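
/*
 * Illustrative userland sketch (not part of this kernel file): a blocking
 * lio_listio(2) batch, which kern_lio_listio() above services by queueing
 * each non-NULL aiocb and, for LIO_WAIT, sleeping until lioj_finished_count
 * catches up with lioj_count.  The two-element list is hypothetical and the
 * aiocbs are assumed to be otherwise filled in.
 *
 *	#include <aio.h>
 *
 *	int
 *	batch_rw(struct aiocb *rd, struct aiocb *wr)
 *	{
 *		struct aiocb *list[2] = { rd, wr };
 *
 *		rd->aio_lio_opcode = LIO_READ;
 *		wr->aio_lio_opcode = LIO_WRITE;
 *		// returns once both transfers have completed (or failed)
 *		return (lio_listio(LIO_WAIT, list, 2, NULL));
 *	}
 */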
2360 
2361 /*
2362  * Called from the interrupt thread for physio; we should return as fast
2363  * as possible, so we schedule a biohelper task.
2364  */
2365 static void
2366 aio_physwakeup(struct buf *bp)
2367 {
2368 	struct aiocblist *aiocbe;
2369 
2370 	aiocbe = (struct aiocblist *)bp->b_caller1;
2371 	taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
2372 }
2373 
2374 /*
2375  * Task routine to perform heavy tasks, process wakeup, and signals.
2376  */
2377 static void
2378 biohelper(void *context, int pending)
2379 {
2380 	struct aiocblist *aiocbe = context;
2381 	struct buf *bp;
2382 	struct proc *userp;
2383 	struct kaioinfo *ki;
2384 	int nblks;
2385 
2386 	bp = aiocbe->bp;
2387 	userp = aiocbe->userproc;
2388 	ki = userp->p_aioinfo;
2389 	AIO_LOCK(ki);
2390 	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2391 	aiocbe->uaiocb._aiocb_private.error = 0;
2392 	if (bp->b_ioflags & BIO_ERROR)
2393 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2394 	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
2395 	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
2396 		aiocbe->outputcharge += nblks;
2397 	else
2398 		aiocbe->inputcharge += nblks;
2399 	aiocbe->bp = NULL;
2400 	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
2401 	ki->kaio_buffer_count--;
2402 	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
2403 	AIO_UNLOCK(ki);
2404 
2405 	/* Release mapping into kernel space. */
2406 	vunmapbuf(bp);
2407 	relpbuf(bp, NULL);
2408 	atomic_subtract_int(&num_buf_aio, 1);
2409 }
2410 
2411 /* syscall - wait for the next completion of an aio request */
2412 static int
2413 kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
2414     struct timespec *ts, struct aiocb_ops *ops)
2415 {
2416 	struct proc *p = td->td_proc;
2417 	struct timeval atv;
2418 	struct kaioinfo *ki;
2419 	struct aiocblist *cb;
2420 	struct aiocb *uuaiocb;
2421 	int error, status, timo;
2422 
2423 	ops->store_aiocb(aiocbp, NULL);
2424 
2425 	timo = 0;
2426 	if (ts) {
2427 		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
2428 			return (EINVAL);
2429 
2430 		TIMESPEC_TO_TIMEVAL(&atv, ts);
2431 		if (itimerfix(&atv))
2432 			return (EINVAL);
2433 		timo = tvtohz(&atv);
2434 	}
2435 
2436 	if (p->p_aioinfo == NULL)
2437 		aio_init_aioinfo(p);
2438 	ki = p->p_aioinfo;
2439 
2440 	error = 0;
2441 	cb = NULL;
2442 	AIO_LOCK(ki);
2443 	while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
2444 		ki->kaio_flags |= KAIO_WAKEUP;
2445 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2446 		    "aiowc", timo);
2447 		if (timo && error == ERESTART)
2448 			error = EINTR;
2449 		if (error)
2450 			break;
2451 	}
2452 
2453 	if (cb != NULL) {
2454 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
2455 		uuaiocb = cb->uuaiocb;
2456 		status = cb->uaiocb._aiocb_private.status;
2457 		error = cb->uaiocb._aiocb_private.error;
2458 		td->td_retval[0] = status;
2459 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2460 			td->td_ru.ru_oublock += cb->outputcharge;
2461 			cb->outputcharge = 0;
2462 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2463 			td->td_ru.ru_inblock += cb->inputcharge;
2464 			cb->inputcharge = 0;
2465 		}
2466 		aio_free_entry(cb);
2467 		AIO_UNLOCK(ki);
2468 		ops->store_aiocb(aiocbp, uuaiocb);
2469 		ops->store_error(uuaiocb, error);
2470 		ops->store_status(uuaiocb, status);
2471 	} else
2472 		AIO_UNLOCK(ki);
2473 
2474 	return (error);
2475 }
2476 
2477 int
2478 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2479 {
2480 	struct timespec ts, *tsp;
2481 	int error;
2482 
2483 	if (uap->timeout) {
2484 		/* Get timespec struct. */
2485 		error = copyin(uap->timeout, &ts, sizeof(ts));
2486 		if (error)
2487 			return (error);
2488 		tsp = &ts;
2489 	} else
2490 		tsp = NULL;
2491 
2492 	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
2493 }
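
/*
 * Illustrative userland sketch (not part of this kernel file): the
 * FreeBSD-specific aio_waitcomplete(2) interface serviced above, which waits
 * for and reaps the next finished job in a single call.  The five-second
 * timeout is hypothetical.
 *
 *	#include <aio.h>
 *	#include <time.h>
 *
 *	ssize_t
 *	reap_next(struct aiocb **which)
 *	{
 *		struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
 *
 *		// on success the transfer count is returned and *which points
 *		// at the caller's aiocb for the job that finished
 *		return (aio_waitcomplete(which, &ts));
 *	}
 */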
2494 
2495 static int
2496 kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
2497     struct aiocb_ops *ops)
2498 {
2499 	struct proc *p = td->td_proc;
2500 	struct kaioinfo *ki;
2501 
2502 	if (op != O_SYNC) /* XXX lack of O_DSYNC */
2503 		return (EINVAL);
2504 	ki = p->p_aioinfo;
2505 	if (ki == NULL)
2506 		aio_init_aioinfo(p);
2507 	return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
2508 }
2509 
2510 int
2511 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
2512 {
2513 
2514 	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
2515 }
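
/*
 * Illustrative userland sketch (not part of this kernel file): queueing an
 * asynchronous fsync.  Only O_SYNC is accepted by kern_aio_fsync() above; the
 * job completes and is reaped like any other request.  The helper name is
 * hypothetical.
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	int
 *	queue_sync(int fd, struct aiocb *acb)
 *	{
 *		memset(acb, 0, sizeof(*acb));
 *		acb->aio_fildes = fd;
 *		return (aio_fsync(O_SYNC, acb));
 *	}
 */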
2516 
2517 /* kqueue attach function */
2518 static int
2519 filt_aioattach(struct knote *kn)
2520 {
2521 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2522 
2523 	/*
2524 	 * The aiocbe pointer must be validated before using it, so
2525 	 * registration is restricted to the kernel; the user cannot
2526 	 * set EV_FLAG1.
2527 	 */
2528 	if ((kn->kn_flags & EV_FLAG1) == 0)
2529 		return (EPERM);
2530 	kn->kn_ptr.p_aio = aiocbe;
2531 	kn->kn_flags &= ~EV_FLAG1;
2532 
2533 	knlist_add(&aiocbe->klist, kn, 0);
2534 
2535 	return (0);
2536 }
2537 
2538 /* kqueue detach function */
2539 static void
2540 filt_aiodetach(struct knote *kn)
2541 {
2542 	struct knlist *knl;
2543 
2544 	knl = &kn->kn_ptr.p_aio->klist;
2545 	knl->kl_lock(knl->kl_lockarg);
2546 	if (!knlist_empty(knl))
2547 		knlist_remove(knl, kn, 1);
2548 	knl->kl_unlock(knl->kl_lockarg);
2549 }
2550 
2551 /* kqueue filter function */
2552 /*ARGSUSED*/
2553 static int
2554 filt_aio(struct knote *kn, long hint)
2555 {
2556 	struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
2557 
2558 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2559 	if (aiocbe->jobstate != JOBST_JOBFINISHED)
2560 		return (0);
2561 	kn->kn_flags |= EV_EOF;
2562 	return (1);
2563 }
2564 
2565 /* kqueue attach function */
2566 static int
2567 filt_lioattach(struct knote *kn)
2568 {
2569 	struct aioliojob *lj = (struct aioliojob *)kn->kn_sdata;
2570 
2571 	/*
2572 	 * The aioliojob pointer must be validated before using it, so
2573 	 * registration is restricted to the kernel; the user cannot
2574 	 * set EV_FLAG1.
2575 	 */
2576 	if ((kn->kn_flags & EV_FLAG1) == 0)
2577 		return (EPERM);
2578 	kn->kn_ptr.p_lio = lj;
2579 	kn->kn_flags &= ~EV_FLAG1;
2580 
2581 	knlist_add(&lj->klist, kn, 0);
2582 
2583 	return (0);
2584 }
2585 
2586 /* kqueue detach function */
2587 static void
2588 filt_liodetach(struct knote *kn)
2589 {
2590 	struct knlist *knl;
2591 
2592 	knl = &kn->kn_ptr.p_lio->klist;
2593 	knl->kl_lock(knl->kl_lockarg);
2594 	if (!knlist_empty(knl))
2595 		knlist_remove(knl, kn, 1);
2596 	knl->kl_unlock(knl->kl_lockarg);
2597 }
2598 
2599 /* kqueue filter function */
2600 /*ARGSUSED*/
2601 static int
2602 filt_lio(struct knote *kn, long hint)
2603 {
2604 	struct aioliojob *lj = kn->kn_ptr.p_lio;
2605 
2606 	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
2607 }
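
/*
 * Illustrative userland sketch (not part of this kernel file): requesting
 * kqueue completion notification instead of a signal, which the EVFILT_AIO
 * filter above backs.  kq is assumed to come from kqueue(2); the kernel
 * registers the knote itself (EV_FLAG1), so userland only fills in the
 * sigevent and later collects the event with kevent(2).
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	int
 *	queue_read_kq(int kq, int fd, void *buf, size_t len, struct aiocb *acb)
 *	{
 *		memset(acb, 0, sizeof(*acb));
 *		acb->aio_fildes = fd;
 *		acb->aio_buf = buf;
 *		acb->aio_nbytes = len;
 *		acb->aio_offset = 0;
 *		acb->aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *		acb->aio_sigevent.sigev_notify_kqueue = kq;
 *		acb->aio_sigevent.sigev_value.sival_ptr = acb;
 *		return (aio_read(acb));	// completion arrives via kevent(2)
 *	}
 */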
2608 
2609 #ifdef COMPAT_FREEBSD32
2610 
2611 struct __aiocb_private32 {
2612 	int32_t	status;
2613 	int32_t	error;
2614 	uint32_t kernelinfo;
2615 };
2616 
2617 typedef struct oaiocb32 {
2618 	int	aio_fildes;		/* File descriptor */
2619 	uint64_t aio_offset __packed;	/* File offset for I/O */
2620 	uint32_t aio_buf;		/* I/O buffer in process space */
2621 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2622 	struct	osigevent32 aio_sigevent; /* Signal to deliver */
2623 	int	aio_lio_opcode;		/* LIO opcode */
2624 	int	aio_reqprio;		/* Request priority -- ignored */
2625 	struct	__aiocb_private32 _aiocb_private;
2626 } oaiocb32_t;
2627 
2628 typedef struct aiocb32 {
2629 	int32_t	aio_fildes;		/* File descriptor */
2630 	uint64_t aio_offset __packed;	/* File offset for I/O */
2631 	uint32_t aio_buf;		/* I/O buffer in process space */
2632 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2633 	int	__spare__[2];
2634 	uint32_t __spare2__;
2635 	int	aio_lio_opcode;		/* LIO opcode */
2636 	int	aio_reqprio;		/* Request priority -- ignored */
2637 	struct __aiocb_private32 _aiocb_private;
2638 	struct sigevent32 aio_sigevent;	/* Signal to deliver */
2639 } aiocb32_t;
2640 
2641 static int
2642 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
2643 {
2644 
2645 	/*
2646 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
2647 	 * supported by AIO with the old sigevent structure.
2648 	 */
2649 	CP(*osig, *nsig, sigev_notify);
2650 	switch (nsig->sigev_notify) {
2651 	case SIGEV_NONE:
2652 		break;
2653 	case SIGEV_SIGNAL:
2654 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
2655 		break;
2656 	case SIGEV_KEVENT:
2657 		nsig->sigev_notify_kqueue =
2658 		    osig->__sigev_u.__sigev_notify_kqueue;
2659 		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
2660 		break;
2661 	default:
2662 		return (EINVAL);
2663 	}
2664 	return (0);
2665 }
2666 
2667 static int
2668 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
2669 {
2670 	struct oaiocb32 job32;
2671 	int error;
2672 
2673 	bzero(kjob, sizeof(struct aiocb));
2674 	error = copyin(ujob, &job32, sizeof(job32));
2675 	if (error)
2676 		return (error);
2677 
2678 	CP(job32, *kjob, aio_fildes);
2679 	CP(job32, *kjob, aio_offset);
2680 	PTRIN_CP(job32, *kjob, aio_buf);
2681 	CP(job32, *kjob, aio_nbytes);
2682 	CP(job32, *kjob, aio_lio_opcode);
2683 	CP(job32, *kjob, aio_reqprio);
2684 	CP(job32, *kjob, _aiocb_private.status);
2685 	CP(job32, *kjob, _aiocb_private.error);
2686 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2687 	return (convert_old_sigevent32(&job32.aio_sigevent,
2688 	    &kjob->aio_sigevent));
2689 }
2690 
2691 static int
2692 convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
2693 {
2694 
2695 	CP(*sig32, *sig, sigev_notify);
2696 	switch (sig->sigev_notify) {
2697 	case SIGEV_NONE:
2698 		break;
2699 	case SIGEV_THREAD_ID:
2700 		CP(*sig32, *sig, sigev_notify_thread_id);
2701 		/* FALLTHROUGH */
2702 	case SIGEV_SIGNAL:
2703 		CP(*sig32, *sig, sigev_signo);
2704 		break;
2705 	case SIGEV_KEVENT:
2706 		CP(*sig32, *sig, sigev_notify_kqueue);
2707 		CP(*sig32, *sig, sigev_notify_kevent_flags);
2708 		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
2709 		break;
2710 	default:
2711 		return (EINVAL);
2712 	}
2713 	return (0);
2714 }
2715 
2716 static int
2717 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
2718 {
2719 	struct aiocb32 job32;
2720 	int error;
2721 
2722 	error = copyin(ujob, &job32, sizeof(job32));
2723 	if (error)
2724 		return (error);
2725 	CP(job32, *kjob, aio_fildes);
2726 	CP(job32, *kjob, aio_offset);
2727 	PTRIN_CP(job32, *kjob, aio_buf);
2728 	CP(job32, *kjob, aio_nbytes);
2729 	CP(job32, *kjob, aio_lio_opcode);
2730 	CP(job32, *kjob, aio_reqprio);
2731 	CP(job32, *kjob, _aiocb_private.status);
2732 	CP(job32, *kjob, _aiocb_private.error);
2733 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2734 	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
2735 }
2736 
2737 static long
2738 aiocb32_fetch_status(struct aiocb *ujob)
2739 {
2740 	struct aiocb32 *ujob32;
2741 
2742 	ujob32 = (struct aiocb32 *)ujob;
2743 	return (fuword32(&ujob32->_aiocb_private.status));
2744 }
2745 
2746 static long
2747 aiocb32_fetch_error(struct aiocb *ujob)
2748 {
2749 	struct aiocb32 *ujob32;
2750 
2751 	ujob32 = (struct aiocb32 *)ujob;
2752 	return (fuword32(&ujob32->_aiocb_private.error));
2753 }
2754 
2755 static int
2756 aiocb32_store_status(struct aiocb *ujob, long status)
2757 {
2758 	struct aiocb32 *ujob32;
2759 
2760 	ujob32 = (struct aiocb32 *)ujob;
2761 	return (suword32(&ujob32->_aiocb_private.status, status));
2762 }
2763 
2764 static int
2765 aiocb32_store_error(struct aiocb *ujob, long error)
2766 {
2767 	struct aiocb32 *ujob32;
2768 
2769 	ujob32 = (struct aiocb32 *)ujob;
2770 	return (suword32(&ujob32->_aiocb_private.error, error));
2771 }
2772 
2773 static int
2774 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
2775 {
2776 	struct aiocb32 *ujob32;
2777 
2778 	ujob32 = (struct aiocb32 *)ujob;
2779 	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
2780 }
2781 
2782 static int
2783 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
2784 {
2785 
2786 	return (suword32(ujobp, (long)ujob));
2787 }
2788 
2789 static struct aiocb_ops aiocb32_ops = {
2790 	.copyin = aiocb32_copyin,
2791 	.fetch_status = aiocb32_fetch_status,
2792 	.fetch_error = aiocb32_fetch_error,
2793 	.store_status = aiocb32_store_status,
2794 	.store_error = aiocb32_store_error,
2795 	.store_kernelinfo = aiocb32_store_kernelinfo,
2796 	.store_aiocb = aiocb32_store_aiocb,
2797 };
2798 
2799 static struct aiocb_ops aiocb32_ops_osigevent = {
2800 	.copyin = aiocb32_copyin_old_sigevent,
2801 	.fetch_status = aiocb32_fetch_status,
2802 	.fetch_error = aiocb32_fetch_error,
2803 	.store_status = aiocb32_store_status,
2804 	.store_error = aiocb32_store_error,
2805 	.store_kernelinfo = aiocb32_store_kernelinfo,
2806 	.store_aiocb = aiocb32_store_aiocb,
2807 };
2808 
2809 int
2810 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
2811 {
2812 
2813 	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2814 }
2815 
2816 int
2817 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
2818 {
2819 	struct timespec32 ts32;
2820 	struct timespec ts, *tsp;
2821 	struct aiocb **ujoblist;
2822 	uint32_t *ujoblist32;
2823 	int error, i;
2824 
2825 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
2826 		return (EINVAL);
2827 
2828 	if (uap->timeout) {
2829 		/* Get timespec struct. */
2830 		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
2831 			return (error);
2832 		CP(ts32, ts, tv_sec);
2833 		CP(ts32, ts, tv_nsec);
2834 		tsp = &ts;
2835 	} else
2836 		tsp = NULL;
2837 
2838 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
2839 	ujoblist32 = (uint32_t *)ujoblist;
2840 	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
2841 	    sizeof(ujoblist32[0]));
2842 	if (error == 0) {
2843 		for (i = uap->nent - 1; i >= 0; i--)
2844 			ujoblist[i] = PTRIN(ujoblist32[i]);
2845 
2846 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2847 	}
2848 	uma_zfree(aiol_zone, ujoblist);
2849 	return (error);
2850 }
2851 
2852 int
2853 freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
2854 {
2855 
2856 	return (sys_aio_cancel(td, (struct aio_cancel_args *)uap));
2857 }
2858 
2859 int
2860 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
2861 {
2862 
2863 	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2864 }
2865 
2866 int
2867 freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
2868 {
2869 
2870 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2871 	    &aiocb32_ops_osigevent));
2872 }
2873 
2874 int
2875 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
2876 {
2877 
2878 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2879 	    &aiocb32_ops));
2880 }
2881 
2882 int
2883 freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
2884 {
2885 
2886 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2887 	    &aiocb32_ops_osigevent));
2888 }
2889 
2890 int
2891 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
2892 {
2893 
2894 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2895 	    &aiocb32_ops));
2896 }
2897 
2898 int
2899 freebsd32_aio_waitcomplete(struct thread *td,
2900     struct freebsd32_aio_waitcomplete_args *uap)
2901 {
2902 	struct timespec32 ts32;
2903 	struct timespec ts, *tsp;
2904 	int error;
2905 
2906 	if (uap->timeout) {
2907 		/* Get timespec struct. */
2908 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
2909 		if (error)
2910 			return (error);
2911 		CP(ts32, ts, tv_sec);
2912 		CP(ts32, ts, tv_nsec);
2913 		tsp = &ts;
2914 	} else
2915 		tsp = NULL;
2916 
2917 	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
2918 	    &aiocb32_ops));
2919 }
2920 
2921 int
2922 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
2923 {
2924 
2925 	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
2926 	    &aiocb32_ops));
2927 }
2928 
2929 int
2930 freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
2931 {
2932 	struct aiocb **acb_list;
2933 	struct sigevent *sigp, sig;
2934 	struct osigevent32 osig;
2935 	uint32_t *acb_list32;
2936 	int error, i, nent;
2937 
2938 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2939 		return (EINVAL);
2940 
2941 	nent = uap->nent;
2942 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2943 		return (EINVAL);
2944 
2945 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2946 		error = copyin(uap->sig, &osig, sizeof(osig));
2947 		if (error)
2948 			return (error);
2949 		error = convert_old_sigevent32(&osig, &sig);
2950 		if (error)
2951 			return (error);
2952 		sigp = &sig;
2953 	} else
2954 		sigp = NULL;
2955 
2956 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2957 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
2958 	if (error) {
2959 		free(acb_list32, M_LIO);
2960 		return (error);
2961 	}
2962 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2963 	for (i = 0; i < nent; i++)
2964 		acb_list[i] = PTRIN(acb_list32[i]);
2965 	free(acb_list32, M_LIO);
2966 
2967 	error = kern_lio_listio(td, uap->mode,
2968 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2969 	    &aiocb32_ops_osigevent);
2970 	free(acb_list, M_LIO);
2971 	return (error);
2972 }
2973 
2974 int
2975 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
2976 {
2977 	struct aiocb **acb_list;
2978 	struct sigevent *sigp, sig;
2979 	struct sigevent32 sig32;
2980 	uint32_t *acb_list32;
2981 	int error, i, nent;
2982 
2983 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2984 		return (EINVAL);
2985 
2986 	nent = uap->nent;
2987 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2988 		return (EINVAL);
2989 
2990 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2991 		error = copyin(uap->sig, &sig32, sizeof(sig32));
2992 		if (error)
2993 			return (error);
2994 		error = convert_sigevent32(&sig32, &sig);
2995 		if (error)
2996 			return (error);
2997 		sigp = &sig;
2998 	} else
2999 		sigp = NULL;
3000 
3001 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
3002 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
3003 	if (error) {
3004 		free(acb_list32, M_LIO);
3005 		return (error);
3006 	}
3007 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
3008 	for (i = 0; i < nent; i++)
3009 		acb_list[i] = PTRIN(acb_list32[i]);
3010 	free(acb_list32, M_LIO);
3011 
3012 	error = kern_lio_listio(td, uap->mode,
3013 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
3014 	    &aiocb32_ops);
3015 	free(acb_list, M_LIO);
3016 	return (error);
3017 }
3018 
3019 #endif
3020