xref: /freebsd/sys/kern/vfs_aio.c (revision 1f4bcc459a76b7aa664f3fd557684cd0ba6da352)
1 /*-
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  */
16 
17 /*
18  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
19  */
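/*
 * For orientation, a minimal (hypothetical) userland sequence using this
 * facility looks roughly like the following: queue a request, poll (or
 * sleep in aio_suspend()) until it completes, then reap it with
 * aio_return().  Error handling is omitted and "fd"/"buf" are placeholders:
 *
 *	struct aiocb iocb;
 *
 *	memset(&iocb, 0, sizeof(iocb));
 *	iocb.aio_fildes = fd;
 *	iocb.aio_buf = buf;
 *	iocb.aio_nbytes = sizeof(buf);
 *	iocb.aio_offset = 0;
 *	aio_read(&iocb);
 *	while (aio_error(&iocb) == EINPROGRESS)
 *		;
 *	len = aio_return(&iocb);
 *
 * The kernel-side entry points for these system calls are implemented below.
 */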
20 
21 #include <sys/cdefs.h>
22 __FBSDID("$FreeBSD$");
23 
24 #include "opt_compat.h"
25 
26 #include <sys/param.h>
27 #include <sys/systm.h>
28 #include <sys/malloc.h>
29 #include <sys/bio.h>
30 #include <sys/buf.h>
31 #include <sys/capsicum.h>
32 #include <sys/eventhandler.h>
33 #include <sys/sysproto.h>
34 #include <sys/filedesc.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/kthread.h>
38 #include <sys/fcntl.h>
39 #include <sys/file.h>
40 #include <sys/limits.h>
41 #include <sys/lock.h>
42 #include <sys/mutex.h>
43 #include <sys/unistd.h>
44 #include <sys/posix4.h>
45 #include <sys/proc.h>
46 #include <sys/resourcevar.h>
47 #include <sys/signalvar.h>
48 #include <sys/protosw.h>
49 #include <sys/rwlock.h>
50 #include <sys/sema.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <sys/syscall.h>
54 #include <sys/sysent.h>
55 #include <sys/sysctl.h>
56 #include <sys/sx.h>
57 #include <sys/taskqueue.h>
58 #include <sys/vnode.h>
59 #include <sys/conf.h>
60 #include <sys/event.h>
61 #include <sys/mount.h>
62 #include <geom/geom.h>
63 
64 #include <machine/atomic.h>
65 
66 #include <vm/vm.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_extern.h>
69 #include <vm/pmap.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 #include <vm/uma.h>
73 #include <sys/aio.h>
74 
75 #include "opt_vfs_aio.h"
76 
77 /*
78  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
79  * overflow. (XXX will be removed soon.)
80  */
81 static u_long jobrefid;
82 
83 /*
84  * Counter for aio_fsync.
85  */
86 static uint64_t jobseqno;
87 
88 #define JOBST_NULL		0
89 #define JOBST_JOBQSOCK		1
90 #define JOBST_JOBQGLOBAL	2
91 #define JOBST_JOBRUNNING	3
92 #define JOBST_JOBFINISHED	4
93 #define JOBST_JOBQBUF		5
94 #define JOBST_JOBQSYNC		6
95 
96 #ifndef MAX_AIO_PER_PROC
97 #define MAX_AIO_PER_PROC	32
98 #endif
99 
100 #ifndef MAX_AIO_QUEUE_PER_PROC
101 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
102 #endif
103 
104 #ifndef MAX_AIO_PROCS
105 #define MAX_AIO_PROCS		32
106 #endif
107 
108 #ifndef MAX_AIO_QUEUE
109 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
110 #endif
111 
112 #ifndef TARGET_AIO_PROCS
113 #define TARGET_AIO_PROCS	4
114 #endif
115 
116 #ifndef MAX_BUF_AIO
117 #define MAX_BUF_AIO		16
118 #endif
119 
120 #ifndef AIOD_LIFETIME_DEFAULT
121 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
122 #endif
123 
124 FEATURE(aio, "Asynchronous I/O");
125 
126 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
127 
128 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0,
129     "Async IO management");
130 
131 static int max_aio_procs = MAX_AIO_PROCS;
132 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
133     "Maximum number of kernel processes to use for handling async IO");
134 
135 static int num_aio_procs = 0;
136 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
137     "Number of presently active kernel processes for async IO");
138 
139 /*
140  * The code will adjust the actual number of AIO processes towards this
141  * number when it gets a chance.
142  */
143 static int target_aio_procs = TARGET_AIO_PROCS;
144 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
145     0,
146     "Preferred number of ready kernel processes for async IO");
147 
148 static int max_queue_count = MAX_AIO_QUEUE;
149 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
150     "Maximum number of aio requests to queue, globally");
151 
152 static int num_queue_count = 0;
153 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
154     "Number of queued aio requests");
155 
156 static int num_buf_aio = 0;
157 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
158     "Number of aio requests presently handled by the buf subsystem");
159 
160 /* Number of async I/O processes in the process of being started */
161 /* XXX This should be local to aio_aqueue() */
162 static int num_aio_resv_start = 0;
163 
164 static int aiod_lifetime;
165 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
166     "Maximum lifetime for idle aiod");
167 
168 static int unloadable = 0;
169 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
170     "Allow unload of aio (not recommended)");
171 
172 
173 static int max_aio_per_proc = MAX_AIO_PER_PROC;
174 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
175     0,
176     "Maximum active aio requests per process (stored in the process)");
177 
178 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
179 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
180     &max_aio_queue_per_proc, 0,
181     "Maximum queued aio requests per process (stored in the process)");
182 
183 static int max_buf_aio = MAX_BUF_AIO;
184 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
185     "Maximum buf aio requests per process (stored in the process)");
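/*
 * The limits above are exposed under the vfs.aio sysctl tree declared
 * earlier in this file and can be tuned at run time.  For example (the
 * values shown are illustrative only):
 *
 *	sysctl vfs.aio.max_aio_procs=64
 *	sysctl vfs.aio.max_aio_per_proc=64
 *	sysctl vfs.aio.max_aio_queue_per_proc=512
 *
 * or made persistent via /etc/sysctl.conf.
 */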
186 
187 typedef struct oaiocb {
188 	int	aio_fildes;		/* File descriptor */
189 	off_t	aio_offset;		/* File offset for I/O */
190 	volatile void *aio_buf;         /* I/O buffer in process space */
191 	size_t	aio_nbytes;		/* Number of bytes for I/O */
192 	struct	osigevent aio_sigevent;	/* Signal to deliver */
193 	int	aio_lio_opcode;		/* LIO opcode */
194 	int	aio_reqprio;		/* Request priority -- ignored */
195 	struct	__aiocb_private	_aiocb_private;
196 } oaiocb_t;
197 
198 /*
199  * Below is a key of the locks used to protect each member of struct
200  * aiocblist, aioliojob, and kaioinfo, and any backends.
201  *
202  * * - need not be protected
203  * a - locked by kaioinfo lock
204  * b - locked by the backend lock; the backend lock may be null in some
205  *     cases (for example, the BIO backend), in which case the proc lock
206  *     is reused.
207  * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
208  */
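/*
 * For example, a field annotated (a), such as kaio_count in struct kaioinfo
 * below, may only be manipulated with the owning kaioinfo lock held
 * (illustrative sketch; the AIO_LOCK()/AIO_UNLOCK() macros are defined
 * further down in this file):
 *
 *	AIO_LOCK(ki);
 *	ki->kaio_count++;
 *	AIO_UNLOCK(ki);
 */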
209 
210 /*
211  * Currently there are only two backends: BIO and generic file I/O.
212  * Socket I/O is served by the generic file I/O backend; this is not a good
213  * idea, since disk file I/O and any other type without the O_NONBLOCK flag
214  * can block the daemon processes.  If there is no thread to serve socket
215  * I/O, that I/O will be delayed too long or starved.  We should create
216  * processes dedicated to sockets for non-blocking I/O, and the same for
217  * pipes and fifos; for these I/O systems we really need a non-blocking
218  * interface.  Fiddling with O_NONBLOCK in the file structure is not safe
219  * because there is a race between userland and the aio daemons.
220  */
221 
222 struct aiocblist {
223 	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list for backend */
224 	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
225 	TAILQ_ENTRY(aiocblist) allist;  /* (a) list of all jobs in proc */
226 	int	jobflags;		/* (a) job flags */
227 	int	jobstate;		/* (b) job state */
228 	int	inputcharge;		/* (*) input blocks */
229 	int	outputcharge;		/* (*) output blocks */
230 	struct	bio *bp;		/* (*) BIO backend BIO pointer */
231 	struct	buf *pbuf;		/* (*) BIO backend buffer pointer */
232 	struct	vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */
233 	int	npages;			/* BIO backend number of pages */
234 	struct	proc *userproc;		/* (*) user process */
235 	struct	ucred *cred;		/* (*) active credential when created */
236 	struct	file *fd_file;		/* (*) pointer to file structure */
237 	struct	aioliojob *lio;		/* (*) optional lio job */
238 	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
239 	struct	knlist klist;		/* (a) list of knotes */
240 	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
241 	ksiginfo_t ksi;			/* (a) realtime signal info */
242 	uint64_t seqno;			/* (*) job number */
243 	int	pending;		/* (a) number of pending I/O, aio_fsync only */
244 };
245 
246 /* jobflags */
247 #define AIOCBLIST_DONE		0x01
248 #define AIOCBLIST_BUFDONE	0x02
249 #define AIOCBLIST_RUNDOWN	0x04
250 #define AIOCBLIST_CHECKSYNC	0x08
251 
252 /*
253  * AIO process info
254  */
255 #define AIOP_FREE	0x1			/* proc on free queue */
256 
257 struct aioproc {
258 	int	aioprocflags;			/* (c) AIO proc flags */
259 	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
260 	struct	proc *aioproc;			/* (*) the AIO proc */
261 };
262 
263 /*
264  * data-structure for lio signal management
265  */
266 struct aioliojob {
267 	int	lioj_flags;			/* (a) listio flags */
268 	int	lioj_count;			/* (a) count of I/O jobs in lio */
269 	int	lioj_finished_count;		/* (a) count of finished I/O jobs */
270 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
271 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
272 	struct	knlist klist;			/* (a) list of knotes */
273 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
274 };
275 
276 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
277 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
278 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
279 
280 /*
281  * per process aio data structure
282  */
283 struct kaioinfo {
284 	struct	mtx kaio_mtx;		/* the lock to protect this struct */
285 	int	kaio_flags;		/* (a) per process kaio flags */
286 	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
287 	int	kaio_active_count;	/* (c) number of currently used AIOs */
288 	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
289 	int	kaio_count;		/* (a) size of AIO queue */
290 	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
291 	int	kaio_buffer_count;	/* (a) number of physio buffers */
292 	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in a process */
293 	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
294 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
295 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
296 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue */
297 	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
298 	struct	task kaio_task;		/* (*) task to kick aio processes */
299 };
300 
301 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
302 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
303 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
304 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
305 
306 #define KAIO_RUNDOWN	0x1	/* process is being run down */
307 #define KAIO_WAKEUP	0x2	/* wakeup process when AIO completes */
308 
309 /*
310  * Operations used to interact with userland aio control blocks.
311  * Different ABIs provide their own operations.
312  */
313 struct aiocb_ops {
314 	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);
315 	long	(*fetch_status)(struct aiocb *ujob);
316 	long	(*fetch_error)(struct aiocb *ujob);
317 	int	(*store_status)(struct aiocb *ujob, long status);
318 	int	(*store_error)(struct aiocb *ujob, long error);
319 	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
320 	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
321 };
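/*
 * Code in this file generally reaches userland control blocks through the
 * ops vector for the caller's ABI rather than calling copyin()/suword()
 * directly, e.g. (sketch only; see aio_aqueue() and kern_aio_return() for
 * real usage):
 *
 *	error = ops->copyin(ujob, &kjob);
 *	...
 *	ops->store_error(ujob, error);
 *
 * This lets a compat ABI (such as the 32-bit one whose system calls are
 * registered below under COMPAT_FREEBSD32) substitute its own structure
 * translation routines.
 */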
322 
323 static TAILQ_HEAD(,aioproc) aio_freeproc;		/* (c) Idle daemons */
324 static struct sema aio_newproc_sem;
325 static struct mtx aio_job_mtx;
326 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* (c) Async job list */
327 static struct unrhdr *aiod_unr;
328 
329 void		aio_init_aioinfo(struct proc *p);
330 static int	aio_onceonly(void);
331 static int	aio_free_entry(struct aiocblist *aiocbe);
332 static void	aio_process_rw(struct aiocblist *aiocbe);
333 static void	aio_process_sync(struct aiocblist *aiocbe);
334 static void	aio_process_mlock(struct aiocblist *aiocbe);
335 static int	aio_newproc(int *);
336 int		aio_aqueue(struct thread *td, struct aiocb *job,
337 		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
338 static void	aio_physwakeup(struct bio *bp);
339 static void	aio_proc_rundown(void *arg, struct proc *p);
340 static void	aio_proc_rundown_exec(void *arg, struct proc *p,
341 		    struct image_params *imgp);
342 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
343 static void	aio_daemon(void *param);
344 static void	aio_swake_cb(struct socket *, struct sockbuf *);
345 static int	aio_unload(void);
346 static void	aio_bio_done_notify(struct proc *userp,
347 		    struct aiocblist *aiocbe, int type);
348 #define DONE_BUF	1
349 #define DONE_QUEUE	2
350 static int	aio_kick(struct proc *userp);
351 static void	aio_kick_nowait(struct proc *userp);
352 static void	aio_kick_helper(void *context, int pending);
353 static int	filt_aioattach(struct knote *kn);
354 static void	filt_aiodetach(struct knote *kn);
355 static int	filt_aio(struct knote *kn, long hint);
356 static int	filt_lioattach(struct knote *kn);
357 static void	filt_liodetach(struct knote *kn);
358 static int	filt_lio(struct knote *kn, long hint);
359 
360 /*
361  * Zones for:
362  * 	kaio	Per process async io info
363  *	aiop	async io process data
364  *	aiocb	async io jobs
365  *	aiol	list io job pointer - internal to aio_suspend XXX
366  *	aiolio	list io jobs
367  */
368 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
369 
370 /* kqueue filters for aio */
371 static struct filterops aio_filtops = {
372 	.f_isfd = 0,
373 	.f_attach = filt_aioattach,
374 	.f_detach = filt_aiodetach,
375 	.f_event = filt_aio,
376 };
377 static struct filterops lio_filtops = {
378 	.f_isfd = 0,
379 	.f_attach = filt_lioattach,
380 	.f_detach = filt_liodetach,
381 	.f_event = filt_lio
382 };
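/*
 * These filters back SIGEV_KEVENT completion notification.  As a rough,
 * hypothetical userland sketch (setup of the other aiocb fields and error
 * handling omitted), completions can be delivered to an existing kqueue:
 *
 *	int kq = kqueue();
 *	struct kevent ev;
 *
 *	iocb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	iocb.aio_sigevent.sigev_notify_kqueue = kq;
 *	iocb.aio_sigevent.sigev_value.sival_ptr = &iocb;
 *	aio_read(&iocb);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *
 * On completion, ev.udata carries the sival_ptr stored above (here &iocb),
 * so the consumer can pass it straight to aio_return().  The kernel-side
 * knote registration happens in aio_aqueue().
 */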
383 
384 static eventhandler_tag exit_tag, exec_tag;
385 
386 TASKQUEUE_DEFINE_THREAD(aiod_kick);
387 
388 /*
389  * Main operations function for use as a kernel module.
390  */
391 static int
392 aio_modload(struct module *module, int cmd, void *arg)
393 {
394 	int error = 0;
395 
396 	switch (cmd) {
397 	case MOD_LOAD:
398 		aio_onceonly();
399 		break;
400 	case MOD_UNLOAD:
401 		error = aio_unload();
402 		break;
403 	case MOD_SHUTDOWN:
404 		break;
405 	default:
406 		error = EINVAL;
407 		break;
408 	}
409 	return (error);
410 }
411 
412 static moduledata_t aio_mod = {
413 	"aio",
414 	&aio_modload,
415 	NULL
416 };
417 
418 static struct syscall_helper_data aio_syscalls[] = {
419 	SYSCALL_INIT_HELPER(aio_cancel),
420 	SYSCALL_INIT_HELPER(aio_error),
421 	SYSCALL_INIT_HELPER(aio_fsync),
422 	SYSCALL_INIT_HELPER(aio_mlock),
423 	SYSCALL_INIT_HELPER(aio_read),
424 	SYSCALL_INIT_HELPER(aio_return),
425 	SYSCALL_INIT_HELPER(aio_suspend),
426 	SYSCALL_INIT_HELPER(aio_waitcomplete),
427 	SYSCALL_INIT_HELPER(aio_write),
428 	SYSCALL_INIT_HELPER(lio_listio),
429 	SYSCALL_INIT_HELPER(oaio_read),
430 	SYSCALL_INIT_HELPER(oaio_write),
431 	SYSCALL_INIT_HELPER(olio_listio),
432 	SYSCALL_INIT_LAST
433 };
434 
435 #ifdef COMPAT_FREEBSD32
436 #include <sys/mount.h>
437 #include <sys/socket.h>
438 #include <compat/freebsd32/freebsd32.h>
439 #include <compat/freebsd32/freebsd32_proto.h>
440 #include <compat/freebsd32/freebsd32_signal.h>
441 #include <compat/freebsd32/freebsd32_syscall.h>
442 #include <compat/freebsd32/freebsd32_util.h>
443 
444 static struct syscall_helper_data aio32_syscalls[] = {
445 	SYSCALL32_INIT_HELPER(freebsd32_aio_return),
446 	SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
447 	SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
448 	SYSCALL32_INIT_HELPER(freebsd32_aio_error),
449 	SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
450 	SYSCALL32_INIT_HELPER(freebsd32_aio_mlock),
451 	SYSCALL32_INIT_HELPER(freebsd32_aio_read),
452 	SYSCALL32_INIT_HELPER(freebsd32_aio_write),
453 	SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
454 	SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
455 	SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
456 	SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
457 	SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
458 	SYSCALL_INIT_LAST
459 };
460 #endif
461 
462 DECLARE_MODULE(aio, aio_mod,
463 	SI_SUB_VFS, SI_ORDER_ANY);
464 MODULE_VERSION(aio, 1);
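/*
 * The facility can be compiled into the kernel with "options VFS_AIO"
 * (hence the opt_vfs_aio.h include above) or, since it is declared as a
 * module here, loaded at run time with "kldload aio".  Unloading is
 * refused unless the vfs.aio.unloadable sysctl is set; see aio_unload().
 */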
465 
466 /*
467  * Startup initialization
468  */
469 static int
470 aio_onceonly(void)
471 {
472 	int error;
473 
474 	/* XXX: should probably just use so->callback */
475 	aio_swake = &aio_swake_cb;
476 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
477 	    EVENTHANDLER_PRI_ANY);
478 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
479 	    NULL, EVENTHANDLER_PRI_ANY);
480 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
481 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
482 	TAILQ_INIT(&aio_freeproc);
483 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
484 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
485 	TAILQ_INIT(&aio_jobs);
486 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
487 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
488 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
489 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
490 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
491 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
492 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
493 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
494 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
495 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
496 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
497 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
498 	jobrefid = 1;
499 	async_io_version = _POSIX_VERSION;
500 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
501 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
502 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
503 
504 	error = syscall_helper_register(aio_syscalls, SY_THR_STATIC_KLD);
505 	if (error)
506 		return (error);
507 #ifdef COMPAT_FREEBSD32
508 	error = syscall32_helper_register(aio32_syscalls, SY_THR_STATIC_KLD);
509 	if (error)
510 		return (error);
511 #endif
512 	return (0);
513 }
514 
515 /*
516  * Callback for unload of AIO when used as a module.
517  */
518 static int
519 aio_unload(void)
520 {
521 	int error;
522 
523 	/*
524 	 * XXX: no unloads by default, it's too dangerous.
525 	 * perhaps we could do it if locked out callers and then
526 	 * did an aio_proc_rundown() on each process.
527 	 *
528 	 * jhb: aio_proc_rundown() needs to run on curproc though,
529 	 * so I don't think that would fly.
530 	 */
531 	if (!unloadable)
532 		return (EOPNOTSUPP);
533 
534 #ifdef COMPAT_FREEBSD32
535 	syscall32_helper_unregister(aio32_syscalls);
536 #endif
537 	syscall_helper_unregister(aio_syscalls);
538 
539 	error = kqueue_del_filteropts(EVFILT_AIO);
540 	if (error)
541 		return error;
542 	error = kqueue_del_filteropts(EVFILT_LIO);
543 	if (error)
544 		return error;
545 	async_io_version = 0;
546 	aio_swake = NULL;
547 	taskqueue_free(taskqueue_aiod_kick);
548 	delete_unrhdr(aiod_unr);
549 	uma_zdestroy(kaio_zone);
550 	uma_zdestroy(aiop_zone);
551 	uma_zdestroy(aiocb_zone);
552 	uma_zdestroy(aiol_zone);
553 	uma_zdestroy(aiolio_zone);
554 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
555 	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
556 	mtx_destroy(&aio_job_mtx);
557 	sema_destroy(&aio_newproc_sem);
558 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
559 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
560 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
561 	return (0);
562 }
563 
564 /*
565  * Init the per-process aioinfo structure.  The aioinfo limits are set
566  * per-process for user limit (resource) management.
567  */
568 void
569 aio_init_aioinfo(struct proc *p)
570 {
571 	struct kaioinfo *ki;
572 
573 	ki = uma_zalloc(kaio_zone, M_WAITOK);
574 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
575 	ki->kaio_flags = 0;
576 	ki->kaio_maxactive_count = max_aio_per_proc;
577 	ki->kaio_active_count = 0;
578 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
579 	ki->kaio_count = 0;
580 	ki->kaio_ballowed_count = max_buf_aio;
581 	ki->kaio_buffer_count = 0;
582 	TAILQ_INIT(&ki->kaio_all);
583 	TAILQ_INIT(&ki->kaio_done);
584 	TAILQ_INIT(&ki->kaio_jobqueue);
585 	TAILQ_INIT(&ki->kaio_bufqueue);
586 	TAILQ_INIT(&ki->kaio_liojoblist);
587 	TAILQ_INIT(&ki->kaio_syncqueue);
588 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
589 	PROC_LOCK(p);
590 	if (p->p_aioinfo == NULL) {
591 		p->p_aioinfo = ki;
592 		PROC_UNLOCK(p);
593 	} else {
594 		PROC_UNLOCK(p);
595 		mtx_destroy(&ki->kaio_mtx);
596 		uma_zfree(kaio_zone, ki);
597 	}
598 
599 	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
600 		aio_newproc(NULL);
601 }
602 
603 static int
604 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
605 {
606 	struct thread *td;
607 	int error;
608 
609 	error = sigev_findtd(p, sigev, &td);
610 	if (error)
611 		return (error);
612 	if (!KSI_ONQ(ksi)) {
613 		ksiginfo_set_sigev(ksi, sigev);
614 		ksi->ksi_code = SI_ASYNCIO;
615 		ksi->ksi_flags |= KSI_EXT | KSI_INS;
616 		tdsendsignal(p, td, ksi->ksi_signo, ksi);
617 	}
618 	PROC_UNLOCK(p);
619 	return (error);
620 }
621 
622 /*
623  * Free a job entry.  Wait for completion if it is currently active, but don't
624  * delay forever.  If we delay, we return a flag that says that we have to
625  * restart the queue scan.
626  */
627 static int
628 aio_free_entry(struct aiocblist *aiocbe)
629 {
630 	struct kaioinfo *ki;
631 	struct aioliojob *lj;
632 	struct proc *p;
633 
634 	p = aiocbe->userproc;
635 	MPASS(curproc == p);
636 	ki = p->p_aioinfo;
637 	MPASS(ki != NULL);
638 
639 	AIO_LOCK_ASSERT(ki, MA_OWNED);
640 	MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
641 
642 	atomic_subtract_int(&num_queue_count, 1);
643 
644 	ki->kaio_count--;
645 	MPASS(ki->kaio_count >= 0);
646 
647 	TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
648 	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
649 
650 	lj = aiocbe->lio;
651 	if (lj) {
652 		lj->lioj_count--;
653 		lj->lioj_finished_count--;
654 
655 		if (lj->lioj_count == 0) {
656 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
657 			/* lio is going away, we need to destroy any knotes */
658 			knlist_delete(&lj->klist, curthread, 1);
659 			PROC_LOCK(p);
660 			sigqueue_take(&lj->lioj_ksi);
661 			PROC_UNLOCK(p);
662 			uma_zfree(aiolio_zone, lj);
663 		}
664 	}
665 
666 	/* aiocbe is going away, we need to destroy any knotes */
667 	knlist_delete(&aiocbe->klist, curthread, 1);
668 	PROC_LOCK(p);
669 	sigqueue_take(&aiocbe->ksi);
670 	PROC_UNLOCK(p);
671 
672 	MPASS(aiocbe->bp == NULL);
673 	aiocbe->jobstate = JOBST_NULL;
674 	AIO_UNLOCK(ki);
675 
676 	/*
677 	 * The thread argument here is used to find the owning process
678 	 * and is also passed to fo_close() which may pass it to various
679 	 * places such as devsw close() routines.  Because of that, we
680 	 * need a thread pointer from the process owning the job that is
681 	 * persistent and won't disappear out from under us or move to
682 	 * another process.
683 	 *
684 	 * Currently, all the callers of this function call it to remove
685 	 * an aiocblist from the current process' job list either via a
686 	 * syscall or due to the current process calling exit() or
687 	 * execve().  Thus, we know that p == curproc.  We also know that
688 	 * curthread can't exit since we are curthread.
689 	 *
690 	 * Therefore, we use curthread as the thread to pass to
691 	 * knlist_delete().  This does mean that it is possible for the
692 	 * thread pointer at close time to differ from the thread pointer
693 	 * at open time, but this is already true of file descriptors in
694 	 * a multithreaded process.
695 	 */
696 	if (aiocbe->fd_file)
697 		fdrop(aiocbe->fd_file, curthread);
698 	crfree(aiocbe->cred);
699 	uma_zfree(aiocb_zone, aiocbe);
700 	AIO_LOCK(ki);
701 
702 	return (0);
703 }
704 
705 static void
706 aio_proc_rundown_exec(void *arg, struct proc *p,
707     struct image_params *imgp __unused)
708 {
709 	aio_proc_rundown(arg, p);
710 }
711 
712 /*
713  * Run down the jobs for a given process.
714  */
715 static void
716 aio_proc_rundown(void *arg, struct proc *p)
717 {
718 	struct kaioinfo *ki;
719 	struct aioliojob *lj;
720 	struct aiocblist *cbe, *cbn;
721 	struct file *fp;
722 	struct socket *so;
723 	int remove;
724 
725 	KASSERT(curthread->td_proc == p,
726 	    ("%s: called on non-curproc", __func__));
727 	ki = p->p_aioinfo;
728 	if (ki == NULL)
729 		return;
730 
731 	AIO_LOCK(ki);
732 	ki->kaio_flags |= KAIO_RUNDOWN;
733 
734 restart:
735 
736 	/*
737 	 * Try to cancel all pending requests. This code simulates
738 	 * aio_cancel on all pending I/O requests.
739 	 */
740 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
741 		remove = 0;
742 		mtx_lock(&aio_job_mtx);
743 		if (cbe->jobstate == JOBST_JOBQGLOBAL) {
744 			TAILQ_REMOVE(&aio_jobs, cbe, list);
745 			remove = 1;
746 		} else if (cbe->jobstate == JOBST_JOBQSOCK) {
747 			fp = cbe->fd_file;
748 			MPASS(fp->f_type == DTYPE_SOCKET);
749 			so = fp->f_data;
750 			TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
751 			remove = 1;
752 		} else if (cbe->jobstate == JOBST_JOBQSYNC) {
753 			TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
754 			remove = 1;
755 		}
756 		mtx_unlock(&aio_job_mtx);
757 
758 		if (remove) {
759 			cbe->jobstate = JOBST_JOBFINISHED;
760 			cbe->uaiocb._aiocb_private.status = -1;
761 			cbe->uaiocb._aiocb_private.error = ECANCELED;
762 			TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
763 			aio_bio_done_notify(p, cbe, DONE_QUEUE);
764 		}
765 	}
766 
767 	/* Wait for all running I/O to be finished */
768 	if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
769 	    TAILQ_FIRST(&ki->kaio_jobqueue)) {
770 		ki->kaio_flags |= KAIO_WAKEUP;
771 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
772 		goto restart;
773 	}
774 
775 	/* Free all completed I/O requests. */
776 	while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
777 		aio_free_entry(cbe);
778 
779 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
780 		if (lj->lioj_count == 0) {
781 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
782 			knlist_delete(&lj->klist, curthread, 1);
783 			PROC_LOCK(p);
784 			sigqueue_take(&lj->lioj_ksi);
785 			PROC_UNLOCK(p);
786 			uma_zfree(aiolio_zone, lj);
787 		} else {
788 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
789 			    lj->lioj_count, lj->lioj_finished_count);
790 		}
791 	}
792 	AIO_UNLOCK(ki);
793 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
794 	mtx_destroy(&ki->kaio_mtx);
795 	uma_zfree(kaio_zone, ki);
796 	p->p_aioinfo = NULL;
797 }
798 
799 /*
800  * Select a job to run (called by an AIO daemon).
801  */
802 static struct aiocblist *
803 aio_selectjob(struct aioproc *aiop)
804 {
805 	struct aiocblist *aiocbe;
806 	struct kaioinfo *ki;
807 	struct proc *userp;
808 
809 	mtx_assert(&aio_job_mtx, MA_OWNED);
810 	TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
811 		userp = aiocbe->userproc;
812 		ki = userp->p_aioinfo;
813 
814 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
815 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
816 			/* Account for currently active jobs. */
817 			ki->kaio_active_count++;
818 			aiocbe->jobstate = JOBST_JOBRUNNING;
819 			break;
820 		}
821 	}
822 	return (aiocbe);
823 }
824 
825 /*
826  * Move all data to a permanent storage device.  This code
827  * simulates the fsync syscall.
828  */
829 static int
830 aio_fsync_vnode(struct thread *td, struct vnode *vp)
831 {
832 	struct mount *mp;
833 	int error;
834 
835 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
836 		goto drop;
837 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
838 	if (vp->v_object != NULL) {
839 		VM_OBJECT_WLOCK(vp->v_object);
840 		vm_object_page_clean(vp->v_object, 0, 0, 0);
841 		VM_OBJECT_WUNLOCK(vp->v_object);
842 	}
843 	error = VOP_FSYNC(vp, MNT_WAIT, td);
844 
845 	VOP_UNLOCK(vp, 0);
846 	vn_finished_write(mp);
847 drop:
848 	return (error);
849 }
850 
851 /*
852  * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
853  * does the I/O request for the non-physio version of the operations.  The
854  * normal vn operations are used, and this code should work in all instances
855  * for every type of file, including pipes, sockets, fifos, and regular files.
856  *
857  * XXX I don't think it works well for sockets, pipes, and fifos.
858  */
859 static void
860 aio_process_rw(struct aiocblist *aiocbe)
861 {
862 	struct ucred *td_savedcred;
863 	struct thread *td;
864 	struct aiocb *cb;
865 	struct file *fp;
866 	struct socket *so;
867 	struct uio auio;
868 	struct iovec aiov;
869 	int cnt;
870 	int error;
871 	int oublock_st, oublock_end;
872 	int inblock_st, inblock_end;
873 
874 	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_READ ||
875 	    aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE,
876 	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
877 
878 	td = curthread;
879 	td_savedcred = td->td_ucred;
880 	td->td_ucred = aiocbe->cred;
881 	cb = &aiocbe->uaiocb;
882 	fp = aiocbe->fd_file;
883 
884 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
885 	aiov.iov_len = cb->aio_nbytes;
886 
887 	auio.uio_iov = &aiov;
888 	auio.uio_iovcnt = 1;
889 	auio.uio_offset = cb->aio_offset;
890 	auio.uio_resid = cb->aio_nbytes;
891 	cnt = cb->aio_nbytes;
892 	auio.uio_segflg = UIO_USERSPACE;
893 	auio.uio_td = td;
894 
895 	inblock_st = td->td_ru.ru_inblock;
896 	oublock_st = td->td_ru.ru_oublock;
897 	/*
898 	 * aio_aqueue() acquires a reference to the file that is
899 	 * released in aio_free_entry().
900 	 */
901 	if (cb->aio_lio_opcode == LIO_READ) {
902 		auio.uio_rw = UIO_READ;
903 		if (auio.uio_resid == 0)
904 			error = 0;
905 		else
906 			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
907 	} else {
908 		if (fp->f_type == DTYPE_VNODE)
909 			bwillwrite();
910 		auio.uio_rw = UIO_WRITE;
911 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
912 	}
913 	inblock_end = td->td_ru.ru_inblock;
914 	oublock_end = td->td_ru.ru_oublock;
915 
916 	aiocbe->inputcharge = inblock_end - inblock_st;
917 	aiocbe->outputcharge = oublock_end - oublock_st;
918 
919 	if ((error) && (auio.uio_resid != cnt)) {
920 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
921 			error = 0;
922 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
923 			int sigpipe = 1;
924 			if (fp->f_type == DTYPE_SOCKET) {
925 				so = fp->f_data;
926 				if (so->so_options & SO_NOSIGPIPE)
927 					sigpipe = 0;
928 			}
929 			if (sigpipe) {
930 				PROC_LOCK(aiocbe->userproc);
931 				kern_psignal(aiocbe->userproc, SIGPIPE);
932 				PROC_UNLOCK(aiocbe->userproc);
933 			}
934 		}
935 	}
936 
937 	cnt -= auio.uio_resid;
938 	cb->_aiocb_private.error = error;
939 	cb->_aiocb_private.status = cnt;
940 	td->td_ucred = td_savedcred;
941 }
942 
943 static void
944 aio_process_sync(struct aiocblist *aiocbe)
945 {
946 	struct thread *td = curthread;
947 	struct ucred *td_savedcred = td->td_ucred;
948 	struct aiocb *cb = &aiocbe->uaiocb;
949 	struct file *fp = aiocbe->fd_file;
950 	int error = 0;
951 
952 	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_SYNC,
953 	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
954 
955 	td->td_ucred = aiocbe->cred;
956 	if (fp->f_vnode != NULL)
957 		error = aio_fsync_vnode(td, fp->f_vnode);
958 	cb->_aiocb_private.error = error;
959 	cb->_aiocb_private.status = 0;
960 	td->td_ucred = td_savedcred;
961 }
962 
963 static void
964 aio_process_mlock(struct aiocblist *aiocbe)
965 {
966 	struct aiocb *cb = &aiocbe->uaiocb;
967 	int error;
968 
969 	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_MLOCK,
970 	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
971 
972 	error = vm_mlock(aiocbe->userproc, aiocbe->cred,
973 	    __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
974 	cb->_aiocb_private.error = error;
975 	cb->_aiocb_private.status = 0;
976 }
977 
978 static void
979 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
980 {
981 	struct aioliojob *lj;
982 	struct kaioinfo *ki;
983 	struct aiocblist *scb, *scbn;
984 	int lj_done;
985 
986 	ki = userp->p_aioinfo;
987 	AIO_LOCK_ASSERT(ki, MA_OWNED);
988 	lj = aiocbe->lio;
989 	lj_done = 0;
990 	if (lj) {
991 		lj->lioj_finished_count++;
992 		if (lj->lioj_count == lj->lioj_finished_count)
993 			lj_done = 1;
994 	}
995 	if (type == DONE_QUEUE) {
996 		aiocbe->jobflags |= AIOCBLIST_DONE;
997 	} else {
998 		aiocbe->jobflags |= AIOCBLIST_BUFDONE;
999 	}
1000 	TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
1001 	aiocbe->jobstate = JOBST_JOBFINISHED;
1002 
1003 	if (ki->kaio_flags & KAIO_RUNDOWN)
1004 		goto notification_done;
1005 
1006 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1007 	    aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
1008 		aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
1009 
1010 	KNOTE_LOCKED(&aiocbe->klist, 1);
1011 
1012 	if (lj_done) {
1013 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
1014 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
1015 			KNOTE_LOCKED(&lj->klist, 1);
1016 		}
1017 		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
1018 		    == LIOJ_SIGNAL
1019 		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
1020 		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
1021 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
1022 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1023 		}
1024 	}
1025 
1026 notification_done:
1027 	if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
1028 		TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
1029 			if (aiocbe->fd_file == scb->fd_file &&
1030 			    aiocbe->seqno < scb->seqno) {
1031 				if (--scb->pending == 0) {
1032 					mtx_lock(&aio_job_mtx);
1033 					scb->jobstate = JOBST_JOBQGLOBAL;
1034 					TAILQ_REMOVE(&ki->kaio_syncqueue, scb,
1035 					    list);
1036 					TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
1037 					aio_kick_nowait(userp);
1038 					mtx_unlock(&aio_job_mtx);
1039 				}
1040 			}
1041 		}
1042 	}
1043 	if (ki->kaio_flags & KAIO_WAKEUP) {
1044 		ki->kaio_flags &= ~KAIO_WAKEUP;
1045 		wakeup(&userp->p_aioinfo);
1046 	}
1047 }
1048 
1049 static void
1050 aio_switch_vmspace(struct aiocblist *aiocbe)
1051 {
1052 
1053 	vmspace_switch_aio(aiocbe->userproc->p_vmspace);
1054 }
1055 
1056 /*
1057  * The AIO daemon.  Most of the actual work is done in aio_process_*,
1058  * but the setup (and address space management) is done in this routine.
1059  */
1060 static void
1061 aio_daemon(void *_id)
1062 {
1063 	struct aiocblist *aiocbe;
1064 	struct aioproc *aiop;
1065 	struct kaioinfo *ki;
1066 	struct proc *p, *userp;
1067 	struct vmspace *myvm;
1068 	struct thread *td = curthread;
1069 	int id = (intptr_t)_id;
1070 
1071 	/*
1072 	 * Grab an extra reference on the daemon's vmspace so that it
1073 	 * doesn't get freed by jobs that switch to a different
1074 	 * vmspace.
1075 	 */
1076 	p = td->td_proc;
1077 	myvm = vmspace_acquire_ref(p);
1078 
1079 	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
1080 
1081 	/*
1082 	 * Allocate and ready the aio control info.  There is one aiop structure
1083 	 * per daemon.
1084 	 */
1085 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
1086 	aiop->aioproc = p;
1087 	aiop->aioprocflags = 0;
1088 
1089 	/*
1090 	 * Wake up the parent process.  (Parent sleeps to keep from blasting away
1091 	 * and creating too many daemons.)
1092 	 */
1093 	sema_post(&aio_newproc_sem);
1094 
1095 	mtx_lock(&aio_job_mtx);
1096 	for (;;) {
1097 		/*
1098 		 * Take daemon off of free queue
1099 		 */
1100 		if (aiop->aioprocflags & AIOP_FREE) {
1101 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1102 			aiop->aioprocflags &= ~AIOP_FREE;
1103 		}
1104 
1105 		/*
1106 		 * Check for jobs.
1107 		 */
1108 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
1109 			mtx_unlock(&aio_job_mtx);
1110 			userp = aiocbe->userproc;
1111 
1112 			/*
1113 			 * Connect to process address space for user program.
1114 			 */
1115 			aio_switch_vmspace(aiocbe);
1116 
1117 			ki = userp->p_aioinfo;
1118 
1119 			/* Do the I/O function. */
1120 			switch(aiocbe->uaiocb.aio_lio_opcode) {
1121 			case LIO_READ:
1122 			case LIO_WRITE:
1123 				aio_process_rw(aiocbe);
1124 				break;
1125 			case LIO_SYNC:
1126 				aio_process_sync(aiocbe);
1127 				break;
1128 			case LIO_MLOCK:
1129 				aio_process_mlock(aiocbe);
1130 				break;
1131 			}
1132 
1133 			mtx_lock(&aio_job_mtx);
1134 			/* Decrement the active job count. */
1135 			ki->kaio_active_count--;
1136 			mtx_unlock(&aio_job_mtx);
1137 
1138 			AIO_LOCK(ki);
1139 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
1140 			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
1141 			AIO_UNLOCK(ki);
1142 
1143 			mtx_lock(&aio_job_mtx);
1144 		}
1145 
1146 		/*
1147 		 * Disconnect from user address space.
1148 		 */
1149 		if (p->p_vmspace != myvm) {
1150 			mtx_unlock(&aio_job_mtx);
1151 			vmspace_switch_aio(myvm);
1152 			mtx_lock(&aio_job_mtx);
1153 			/*
1154 			 * We have to restart to avoid a race; we only sleep
1155 			 * if no job can be selected.
1156 			 */
1157 			continue;
1158 		}
1159 
1160 		mtx_assert(&aio_job_mtx, MA_OWNED);
1161 
1162 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
1163 		aiop->aioprocflags |= AIOP_FREE;
1164 
1165 		/*
1166 		 * If the daemon is inactive for a long time, allow it to exit,
1167 		 * thereby freeing resources.
1168 		 */
1169 		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
1170 		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
1171 		    (aiop->aioprocflags & AIOP_FREE) &&
1172 		    num_aio_procs > target_aio_procs)
1173 			break;
1174 	}
1175 	TAILQ_REMOVE(&aio_freeproc, aiop, list);
1176 	num_aio_procs--;
1177 	mtx_unlock(&aio_job_mtx);
1178 	uma_zfree(aiop_zone, aiop);
1179 	free_unr(aiod_unr, id);
1180 	vmspace_free(myvm);
1181 
1182 	KASSERT(p->p_vmspace == myvm,
1183 	    ("AIOD: bad vmspace for exiting daemon"));
1184 	KASSERT(myvm->vm_refcnt > 1,
1185 	    ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
1186 	kproc_exit(0);
1187 }
1188 
1189 /*
1190  * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
1191  * AIO daemon modifies its environment itself.
1192  */
1193 static int
1194 aio_newproc(int *start)
1195 {
1196 	int error;
1197 	struct proc *p;
1198 	int id;
1199 
1200 	id = alloc_unr(aiod_unr);
1201 	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
1202 		RFNOWAIT, 0, "aiod%d", id);
1203 	if (error == 0) {
1204 		/*
1205 		 * Wait until daemon is started.
1206 		 */
1207 		sema_wait(&aio_newproc_sem);
1208 		mtx_lock(&aio_job_mtx);
1209 		num_aio_procs++;
1210 		if (start != NULL)
1211 			(*start)--;
1212 		mtx_unlock(&aio_job_mtx);
1213 	} else {
1214 		free_unr(aiod_unr, id);
1215 	}
1216 	return (error);
1217 }
1218 
1219 /*
1220  * Try the high-performance, low-overhead physio method for eligible
1221  * VCHR devices.  This method doesn't use an aio helper thread, and
1222  * thus has very low overhead.
1223  *
1224  * Assumes that the caller, aio_aqueue(), has incremented the file
1225  * structure's reference count, preventing its deallocation for the
1226  * duration of this call.
1227  */
1228 static int
1229 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1230 {
1231 	struct aiocb *cb;
1232 	struct file *fp;
1233 	struct bio *bp;
1234 	struct buf *pbuf;
1235 	struct vnode *vp;
1236 	struct cdevsw *csw;
1237 	struct cdev *dev;
1238 	struct kaioinfo *ki;
1239 	struct aioliojob *lj;
1240 	int error, ref, unmap, poff;
1241 	vm_prot_t prot;
1242 
1243 	cb = &aiocbe->uaiocb;
1244 	fp = aiocbe->fd_file;
1245 
1246 	if (fp == NULL || fp->f_type != DTYPE_VNODE)
1247 		return (-1);
1248 
1249 	vp = fp->f_vnode;
1250 	if (vp->v_type != VCHR)
1251 		return (-1);
1252 	if (vp->v_bufobj.bo_bsize == 0)
1253 		return (-1);
1254 	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
1255 		return (-1);
1256 
1257 	ref = 0;
1258 	csw = devvn_refthread(vp, &dev, &ref);
1259 	if (csw == NULL)
1260 		return (ENXIO);
1261 
1262 	if ((csw->d_flags & D_DISK) == 0) {
1263 		error = -1;
1264 		goto unref;
1265 	}
1266 	if (cb->aio_nbytes > dev->si_iosize_max) {
1267 		error = -1;
1268 		goto unref;
1269 	}
1270 
1271 	ki = p->p_aioinfo;
1272 	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
1273 	unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed);
1274 	if (unmap) {
1275 		if (cb->aio_nbytes > MAXPHYS) {
1276 			error = -1;
1277 			goto unref;
1278 		}
1279 	} else {
1280 		if (cb->aio_nbytes > MAXPHYS - poff) {
1281 			error = -1;
1282 			goto unref;
1283 		}
1284 		if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
1285 			error = -1;
1286 			goto unref;
1287 		}
1288 	}
1289 	aiocbe->bp = bp = g_alloc_bio();
1290 	if (!unmap) {
1291 		aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL);
1292 		BUF_KERNPROC(pbuf);
1293 	}
1294 
1295 	AIO_LOCK(ki);
1296 	ki->kaio_count++;
1297 	if (!unmap)
1298 		ki->kaio_buffer_count++;
1299 	lj = aiocbe->lio;
1300 	if (lj)
1301 		lj->lioj_count++;
1302 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1303 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1304 	aiocbe->jobstate = JOBST_JOBQBUF;
1305 	cb->_aiocb_private.status = cb->aio_nbytes;
1306 	AIO_UNLOCK(ki);
1307 
1308 	bp->bio_length = cb->aio_nbytes;
1309 	bp->bio_bcount = cb->aio_nbytes;
1310 	bp->bio_done = aio_physwakeup;
1311 	bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
1312 	bp->bio_offset = cb->aio_offset;
1313 	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
1314 	bp->bio_dev = dev;
1315 	bp->bio_caller1 = (void *)aiocbe;
1316 
1317 	prot = VM_PROT_READ;
1318 	if (cb->aio_lio_opcode == LIO_READ)
1319 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
1320 	if ((aiocbe->npages = vm_fault_quick_hold_pages(
1321 	    &curproc->p_vmspace->vm_map,
1322 	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages,
1323 	    sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) {
1324 		error = EFAULT;
1325 		goto doerror;
1326 	}
1327 	if (!unmap) {
1328 		pmap_qenter((vm_offset_t)pbuf->b_data,
1329 		    aiocbe->pages, aiocbe->npages);
1330 		bp->bio_data = pbuf->b_data + poff;
1331 	} else {
1332 		bp->bio_ma = aiocbe->pages;
1333 		bp->bio_ma_n = aiocbe->npages;
1334 		bp->bio_ma_offset = poff;
1335 		bp->bio_data = unmapped_buf;
1336 		bp->bio_flags |= BIO_UNMAPPED;
1337 	}
1338 
1339 	atomic_add_int(&num_queue_count, 1);
1340 	if (!unmap)
1341 		atomic_add_int(&num_buf_aio, 1);
1342 
1343 	/* Perform transfer. */
1344 	csw->d_strategy(bp);
1345 	dev_relthread(dev, ref);
1346 	return (0);
1347 
1348 doerror:
1349 	AIO_LOCK(ki);
1350 	aiocbe->jobstate = JOBST_NULL;
1351 	TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1352 	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
1353 	ki->kaio_count--;
1354 	if (!unmap)
1355 		ki->kaio_buffer_count--;
1356 	if (lj)
1357 		lj->lioj_count--;
1358 	AIO_UNLOCK(ki);
1359 	if (pbuf) {
1360 		relpbuf(pbuf, NULL);
1361 		aiocbe->pbuf = NULL;
1362 	}
1363 	g_destroy_bio(bp);
1364 	aiocbe->bp = NULL;
1365 unref:
1366 	dev_relthread(dev, ref);
1367 	return (error);
1368 }
1369 
1370 /*
1371  * Wake up aio requests that may be serviceable now.
1372  */
1373 static void
1374 aio_swake_cb(struct socket *so, struct sockbuf *sb)
1375 {
1376 	struct aiocblist *cb, *cbn;
1377 	int opcode;
1378 
1379 	SOCKBUF_LOCK_ASSERT(sb);
1380 	if (sb == &so->so_snd)
1381 		opcode = LIO_WRITE;
1382 	else
1383 		opcode = LIO_READ;
1384 
1385 	sb->sb_flags &= ~SB_AIO;
1386 	mtx_lock(&aio_job_mtx);
1387 	TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
1388 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1389 			if (cb->jobstate != JOBST_JOBQSOCK)
1390 				panic("invalid queue value");
1391 			/* XXX
1392 			 * We don't have an actual socket backend yet,
1393 			 * so we simply move the requests to the generic
1394 			 * file I/O backend.
1395 			 */
1396 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1397 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1398 			aio_kick_nowait(cb->userproc);
1399 		}
1400 	}
1401 	mtx_unlock(&aio_job_mtx);
1402 }
1403 
1404 static int
1405 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
1406 {
1407 
1408 	/*
1409 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
1410 	 * supported by AIO with the old sigevent structure.
1411 	 */
1412 	nsig->sigev_notify = osig->sigev_notify;
1413 	switch (nsig->sigev_notify) {
1414 	case SIGEV_NONE:
1415 		break;
1416 	case SIGEV_SIGNAL:
1417 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
1418 		break;
1419 	case SIGEV_KEVENT:
1420 		nsig->sigev_notify_kqueue =
1421 		    osig->__sigev_u.__sigev_notify_kqueue;
1422 		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
1423 		break;
1424 	default:
1425 		return (EINVAL);
1426 	}
1427 	return (0);
1428 }
1429 
1430 static int
1431 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
1432 {
1433 	struct oaiocb *ojob;
1434 	int error;
1435 
1436 	bzero(kjob, sizeof(struct aiocb));
1437 	error = copyin(ujob, kjob, sizeof(struct oaiocb));
1438 	if (error)
1439 		return (error);
1440 	ojob = (struct oaiocb *)kjob;
1441 	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
1442 }
1443 
1444 static int
1445 aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
1446 {
1447 
1448 	return (copyin(ujob, kjob, sizeof(struct aiocb)));
1449 }
1450 
1451 static long
1452 aiocb_fetch_status(struct aiocb *ujob)
1453 {
1454 
1455 	return (fuword(&ujob->_aiocb_private.status));
1456 }
1457 
1458 static long
1459 aiocb_fetch_error(struct aiocb *ujob)
1460 {
1461 
1462 	return (fuword(&ujob->_aiocb_private.error));
1463 }
1464 
1465 static int
1466 aiocb_store_status(struct aiocb *ujob, long status)
1467 {
1468 
1469 	return (suword(&ujob->_aiocb_private.status, status));
1470 }
1471 
1472 static int
1473 aiocb_store_error(struct aiocb *ujob, long error)
1474 {
1475 
1476 	return (suword(&ujob->_aiocb_private.error, error));
1477 }
1478 
1479 static int
1480 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
1481 {
1482 
1483 	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
1484 }
1485 
1486 static int
1487 aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
1488 {
1489 
1490 	return (suword(ujobp, (long)ujob));
1491 }
1492 
1493 static struct aiocb_ops aiocb_ops = {
1494 	.copyin = aiocb_copyin,
1495 	.fetch_status = aiocb_fetch_status,
1496 	.fetch_error = aiocb_fetch_error,
1497 	.store_status = aiocb_store_status,
1498 	.store_error = aiocb_store_error,
1499 	.store_kernelinfo = aiocb_store_kernelinfo,
1500 	.store_aiocb = aiocb_store_aiocb,
1501 };
1502 
1503 static struct aiocb_ops aiocb_ops_osigevent = {
1504 	.copyin = aiocb_copyin_old_sigevent,
1505 	.fetch_status = aiocb_fetch_status,
1506 	.fetch_error = aiocb_fetch_error,
1507 	.store_status = aiocb_store_status,
1508 	.store_error = aiocb_store_error,
1509 	.store_kernelinfo = aiocb_store_kernelinfo,
1510 	.store_aiocb = aiocb_store_aiocb,
1511 };
1512 
1513 /*
1514  * Queue a new AIO request.  The choice between the threaded and the direct
1515  * physio (VCHR) technique is made in this code.
1516  */
1517 int
1518 aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
1519 	int type, struct aiocb_ops *ops)
1520 {
1521 	struct proc *p = td->td_proc;
1522 	cap_rights_t rights;
1523 	struct file *fp;
1524 	struct socket *so;
1525 	struct aiocblist *aiocbe, *cb;
1526 	struct kaioinfo *ki;
1527 	struct kevent kev;
1528 	struct sockbuf *sb;
1529 	int opcode;
1530 	int error;
1531 	int fd, kqfd;
1532 	int jid;
1533 	u_short evflags;
1534 
1535 	if (p->p_aioinfo == NULL)
1536 		aio_init_aioinfo(p);
1537 
1538 	ki = p->p_aioinfo;
1539 
1540 	ops->store_status(job, -1);
1541 	ops->store_error(job, 0);
1542 	ops->store_kernelinfo(job, -1);
1543 
1544 	if (num_queue_count >= max_queue_count ||
1545 	    ki->kaio_count >= ki->kaio_qallowed_count) {
1546 		ops->store_error(job, EAGAIN);
1547 		return (EAGAIN);
1548 	}
1549 
1550 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
1551 	knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
1552 
1553 	error = ops->copyin(job, &aiocbe->uaiocb);
1554 	if (error) {
1555 		ops->store_error(job, error);
1556 		uma_zfree(aiocb_zone, aiocbe);
1557 		return (error);
1558 	}
1559 
1560 	/* XXX: aio_nbytes is later cast to signed types. */
1561 	if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
1562 		uma_zfree(aiocb_zone, aiocbe);
1563 		return (EINVAL);
1564 	}
1565 
1566 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
1567 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
1568 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
1569 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
1570 		ops->store_error(job, EINVAL);
1571 		uma_zfree(aiocb_zone, aiocbe);
1572 		return (EINVAL);
1573 	}
1574 
1575 	if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1576 	     aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
1577 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1578 		uma_zfree(aiocb_zone, aiocbe);
1579 		return (EINVAL);
1580 	}
1581 
1582 	ksiginfo_init(&aiocbe->ksi);
1583 
1584 	/* Save userspace address of the job info. */
1585 	aiocbe->uuaiocb = job;
1586 
1587 	/* Get the opcode. */
1588 	if (type != LIO_NOP)
1589 		aiocbe->uaiocb.aio_lio_opcode = type;
1590 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1591 
1592 	/*
1593 	 * Validate the opcode and fetch the file object for the specified
1594 	 * file descriptor.
1595 	 *
1596 	 * XXXRW: Moved the opcode validation up here so that we don't
1597 	 * retrieve a file descriptor without knowing what the capability
1598 	 * should be.
1599 	 */
1600 	fd = aiocbe->uaiocb.aio_fildes;
1601 	switch (opcode) {
1602 	case LIO_WRITE:
1603 		error = fget_write(td, fd,
1604 		    cap_rights_init(&rights, CAP_PWRITE), &fp);
1605 		break;
1606 	case LIO_READ:
1607 		error = fget_read(td, fd,
1608 		    cap_rights_init(&rights, CAP_PREAD), &fp);
1609 		break;
1610 	case LIO_SYNC:
1611 		error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
1612 		break;
1613 	case LIO_MLOCK:
1614 		fp = NULL;
1615 		break;
1616 	case LIO_NOP:
1617 		error = fget(td, fd, cap_rights_init(&rights), &fp);
1618 		break;
1619 	default:
1620 		error = EINVAL;
1621 	}
1622 	if (error) {
1623 		uma_zfree(aiocb_zone, aiocbe);
1624 		ops->store_error(job, error);
1625 		return (error);
1626 	}
1627 
1628 	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
1629 		error = EINVAL;
1630 		goto aqueue_fail;
1631 	}
1632 
1633 	if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
1634 		error = EINVAL;
1635 		goto aqueue_fail;
1636 	}
1637 
1638 	aiocbe->fd_file = fp;
1639 
1640 	mtx_lock(&aio_job_mtx);
1641 	jid = jobrefid++;
1642 	aiocbe->seqno = jobseqno++;
1643 	mtx_unlock(&aio_job_mtx);
1644 	error = ops->store_kernelinfo(job, jid);
1645 	if (error) {
1646 		error = EINVAL;
1647 		goto aqueue_fail;
1648 	}
1649 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
1650 
1651 	if (opcode == LIO_NOP) {
1652 		fdrop(fp, td);
1653 		uma_zfree(aiocb_zone, aiocbe);
1654 		return (0);
1655 	}
1656 
1657 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
1658 		goto no_kqueue;
1659 	evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
1660 	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
1661 		error = EINVAL;
1662 		goto aqueue_fail;
1663 	}
1664 	kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1665 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
1666 	kev.filter = EVFILT_AIO;
1667 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
1668 	kev.data = (intptr_t)aiocbe;
1669 	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
1670 	error = kqfd_register(kqfd, &kev, td, 1);
1671 aqueue_fail:
1672 	if (error) {
1673 		if (fp)
1674 			fdrop(fp, td);
1675 		uma_zfree(aiocb_zone, aiocbe);
1676 		ops->store_error(job, error);
1677 		goto done;
1678 	}
1679 no_kqueue:
1680 
1681 	ops->store_error(job, EINPROGRESS);
1682 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1683 	aiocbe->userproc = p;
1684 	aiocbe->cred = crhold(td->td_ucred);
1685 	aiocbe->jobflags = 0;
1686 	aiocbe->lio = lj;
1687 
1688 	if (opcode == LIO_SYNC)
1689 		goto queueit;
1690 
1691 	if (fp && fp->f_type == DTYPE_SOCKET) {
1692 		/*
1693 		 * Alternate queueing for socket ops: Reach down into the
1694 		 * descriptor to get the socket data.  Then check to see if the
1695 		 * socket is ready to be read or written (based on the requested
1696 		 * operation).
1697 		 *
1698 		 * If it is not ready for I/O, then queue the aiocbe on the
1699 		 * socket, and set the flags so we get a call when sbnotify()
1700 		 * happens.
1701 		 *
1702 		 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock
1703 		 * and unlock the snd sockbuf for no reason.
1704 		 */
1705 		so = fp->f_data;
1706 		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
1707 		SOCKBUF_LOCK(sb);
1708 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1709 		    LIO_WRITE) && (!sowriteable(so)))) {
1710 			sb->sb_flags |= SB_AIO;
1711 
1712 			mtx_lock(&aio_job_mtx);
1713 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1714 			mtx_unlock(&aio_job_mtx);
1715 
1716 			AIO_LOCK(ki);
1717 			TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1718 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1719 			aiocbe->jobstate = JOBST_JOBQSOCK;
1720 			ki->kaio_count++;
1721 			if (lj)
1722 				lj->lioj_count++;
1723 			AIO_UNLOCK(ki);
1724 			SOCKBUF_UNLOCK(sb);
1725 			atomic_add_int(&num_queue_count, 1);
1726 			error = 0;
1727 			goto done;
1728 		}
1729 		SOCKBUF_UNLOCK(sb);
1730 	}
1731 
1732 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1733 		goto done;
1734 #if 0
1735 	if (error > 0) {
1736 		aiocbe->uaiocb._aiocb_private.error = error;
1737 		ops->store_error(job, error);
1738 		goto done;
1739 	}
1740 #endif
1741 queueit:
1742 	atomic_add_int(&num_queue_count, 1);
1743 
1744 	AIO_LOCK(ki);
1745 	ki->kaio_count++;
1746 	if (lj)
1747 		lj->lioj_count++;
1748 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1749 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1750 	if (opcode == LIO_SYNC) {
1751 		TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
1752 			if (cb->fd_file == aiocbe->fd_file &&
1753 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1754 			    cb->seqno < aiocbe->seqno) {
1755 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1756 				aiocbe->pending++;
1757 			}
1758 		}
1759 		TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
1760 			if (cb->fd_file == aiocbe->fd_file &&
1761 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1762 			    cb->seqno < aiocbe->seqno) {
1763 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1764 				aiocbe->pending++;
1765 			}
1766 		}
1767 		if (aiocbe->pending != 0) {
1768 			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
1769 			aiocbe->jobstate = JOBST_JOBQSYNC;
1770 			AIO_UNLOCK(ki);
1771 			goto done;
1772 		}
1773 	}
1774 	mtx_lock(&aio_job_mtx);
1775 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1776 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1777 	aio_kick_nowait(p);
1778 	mtx_unlock(&aio_job_mtx);
1779 	AIO_UNLOCK(ki);
1780 	error = 0;
1781 done:
1782 	return (error);
1783 }
1784 
1785 static void
1786 aio_kick_nowait(struct proc *userp)
1787 {
1788 	struct kaioinfo *ki = userp->p_aioinfo;
1789 	struct aioproc *aiop;
1790 
1791 	mtx_assert(&aio_job_mtx, MA_OWNED);
1792 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1793 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1794 		aiop->aioprocflags &= ~AIOP_FREE;
1795 		wakeup(aiop->aioproc);
1796 	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
1797 	    ki->kaio_active_count + num_aio_resv_start <
1798 	    ki->kaio_maxactive_count) {
1799 		taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
1800 	}
1801 }
1802 
1803 static int
1804 aio_kick(struct proc *userp)
1805 {
1806 	struct kaioinfo *ki = userp->p_aioinfo;
1807 	struct aioproc *aiop;
1808 	int error, ret = 0;
1809 
1810 	mtx_assert(&aio_job_mtx, MA_OWNED);
1811 retryproc:
1812 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1813 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1814 		aiop->aioprocflags &= ~AIOP_FREE;
1815 		wakeup(aiop->aioproc);
1816 	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
1817 	    ki->kaio_active_count + num_aio_resv_start <
1818 	    ki->kaio_maxactive_count) {
1819 		num_aio_resv_start++;
1820 		mtx_unlock(&aio_job_mtx);
1821 		error = aio_newproc(&num_aio_resv_start);
1822 		mtx_lock(&aio_job_mtx);
1823 		if (error) {
1824 			num_aio_resv_start--;
1825 			goto retryproc;
1826 		}
1827 	} else {
1828 		ret = -1;
1829 	}
1830 	return (ret);
1831 }
1832 
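/*
 * Taskqueue callback scheduled by aio_kick_nowait(): try to start up to
 * 'pending' AIO daemons for the process, stopping early if aio_kick()
 * reports that no more can be created.
 */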
1833 static void
1834 aio_kick_helper(void *context, int pending)
1835 {
1836 	struct proc *userp = context;
1837 
1838 	mtx_lock(&aio_job_mtx);
1839 	while (--pending >= 0) {
1840 		if (aio_kick(userp))
1841 			break;
1842 	}
1843 	mtx_unlock(&aio_job_mtx);
1844 }
1845 
1846 /*
1847  * Support the aio_return system call; as a side effect, kernel resources
1848  * are released.
1849  */
1850 static int
1851 kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
1852 {
1853 	struct proc *p = td->td_proc;
1854 	struct aiocblist *cb;
1855 	struct kaioinfo *ki;
1856 	int status, error;
1857 
1858 	ki = p->p_aioinfo;
1859 	if (ki == NULL)
1860 		return (EINVAL);
1861 	AIO_LOCK(ki);
1862 	TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
1863 		if (cb->uuaiocb == uaiocb)
1864 			break;
1865 	}
1866 	if (cb != NULL) {
1867 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
1868 		status = cb->uaiocb._aiocb_private.status;
1869 		error = cb->uaiocb._aiocb_private.error;
1870 		td->td_retval[0] = status;
1871 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1872 			td->td_ru.ru_oublock += cb->outputcharge;
1873 			cb->outputcharge = 0;
1874 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1875 			td->td_ru.ru_inblock += cb->inputcharge;
1876 			cb->inputcharge = 0;
1877 		}
1878 		aio_free_entry(cb);
1879 		AIO_UNLOCK(ki);
1880 		ops->store_error(uaiocb, error);
1881 		ops->store_status(uaiocb, status);
1882 	} else {
1883 		error = EINVAL;
1884 		AIO_UNLOCK(ki);
1885 	}
1886 	return (error);
1887 }
1888 
1889 int
1890 sys_aio_return(struct thread *td, struct aio_return_args *uap)
1891 {
1892 
1893 	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
1894 }
1895 
1896 /*
1897  * Allow a process to wake up when any of the I/O requests are completed.
1898  */
1899 static int
1900 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
1901     struct timespec *ts)
1902 {
1903 	struct proc *p = td->td_proc;
1904 	struct timeval atv;
1905 	struct kaioinfo *ki;
1906 	struct aiocblist *cb, *cbfirst;
1907 	int error, i, timo;
1908 
1909 	timo = 0;
1910 	if (ts) {
1911 		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
1912 			return (EINVAL);
1913 
1914 		TIMESPEC_TO_TIMEVAL(&atv, ts);
1915 		if (itimerfix(&atv))
1916 			return (EINVAL);
1917 		timo = tvtohz(&atv);
1918 	}
1919 
1920 	ki = p->p_aioinfo;
1921 	if (ki == NULL)
1922 		return (EAGAIN);
1923 
1924 	if (njoblist == 0)
1925 		return (0);
1926 
1927 	AIO_LOCK(ki);
1928 	for (;;) {
1929 		cbfirst = NULL;
1930 		error = 0;
1931 		TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
1932 			for (i = 0; i < njoblist; i++) {
1933 				if (cb->uuaiocb == ujoblist[i]) {
1934 					if (cbfirst == NULL)
1935 						cbfirst = cb;
1936 					if (cb->jobstate == JOBST_JOBFINISHED)
1937 						goto RETURN;
1938 				}
1939 			}
1940 		}
1941 		/* All tasks were finished. */
1942 		if (cbfirst == NULL)
1943 			break;
1944 
1945 		ki->kaio_flags |= KAIO_WAKEUP;
1946 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
1947 		    "aiospn", timo);
1948 		if (error == ERESTART)
1949 			error = EINTR;
1950 		if (error)
1951 			break;
1952 	}
1953 RETURN:
1954 	AIO_UNLOCK(ki);
1955 	return (error);
1956 }
1957 
1958 int
1959 sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1960 {
1961 	struct timespec ts, *tsp;
1962 	struct aiocb **ujoblist;
1963 	int error;
1964 
1965 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
1966 		return (EINVAL);
1967 
1968 	if (uap->timeout) {
1969 		/* Get timespec struct. */
1970 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1971 			return (error);
1972 		tsp = &ts;
1973 	} else
1974 		tsp = NULL;
1975 
1976 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
1977 	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
1978 	if (error == 0)
1979 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
1980 	uma_zfree(aiol_zone, ujoblist);
1981 	return (error);
1982 }
1983 
1984 /*
1985  * aio_cancel cancels any non-physio AIO operations that are not currently
1986  * in progress.
1987  */
1988 int
1989 sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1990 {
1991 	struct proc *p = td->td_proc;
1992 	struct kaioinfo *ki;
1993 	struct aiocblist *cbe, *cbn;
1994 	struct file *fp;
1995 	struct socket *so;
1996 	cap_rights_t rights;
1997 	int error;
1998 	int remove;
1999 	int cancelled = 0;
2000 	int notcancelled = 0;
2001 	struct vnode *vp;
2002 
2003 	/* Lookup file object. */
2004 	error = fget(td, uap->fd, cap_rights_init(&rights), &fp);
2005 	if (error)
2006 		return (error);
2007 
2008 	ki = p->p_aioinfo;
2009 	if (ki == NULL)
2010 		goto done;
2011 
2012 	if (fp->f_type == DTYPE_VNODE) {
2013 		vp = fp->f_vnode;
2014 		if (vn_isdisk(vp, &error)) {
2015 			fdrop(fp, td);
2016 			td->td_retval[0] = AIO_NOTCANCELED;
2017 			return (0);
2018 		}
2019 	}
2020 
2021 	AIO_LOCK(ki);
2022 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
2023 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
2024 		    ((uap->aiocbp == NULL) ||
2025 		     (uap->aiocbp == cbe->uuaiocb))) {
2026 			remove = 0;
2027 
2028 			mtx_lock(&aio_job_mtx);
2029 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
2030 				TAILQ_REMOVE(&aio_jobs, cbe, list);
2031 				remove = 1;
2032 			} else if (cbe->jobstate == JOBST_JOBQSOCK) {
2033 				MPASS(fp->f_type == DTYPE_SOCKET);
2034 				so = fp->f_data;
2035 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
2036 				remove = 1;
2037 			} else if (cbe->jobstate == JOBST_JOBQSYNC) {
2038 				TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
2039 				remove = 1;
2040 			}
2041 			mtx_unlock(&aio_job_mtx);
2042 
2043 			if (remove) {
2044 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
2045 				cbe->uaiocb._aiocb_private.status = -1;
2046 				cbe->uaiocb._aiocb_private.error = ECANCELED;
2047 				aio_bio_done_notify(p, cbe, DONE_QUEUE);
2048 				cancelled++;
2049 			} else {
2050 				notcancelled++;
2051 			}
2052 			if (uap->aiocbp != NULL)
2053 				break;
2054 		}
2055 	}
2056 	AIO_UNLOCK(ki);
2057 
2058 done:
2059 	fdrop(fp, td);
2060 
2061 	if (uap->aiocbp != NULL) {
2062 		if (cancelled) {
2063 			td->td_retval[0] = AIO_CANCELED;
2064 			return (0);
2065 		}
2066 	}
2067 
2068 	if (notcancelled) {
2069 		td->td_retval[0] = AIO_NOTCANCELED;
2070 		return (0);
2071 	}
2072 
2073 	if (cancelled) {
2074 		td->td_retval[0] = AIO_CANCELED;
2075 		return (0);
2076 	}
2077 
2078 	td->td_retval[0] = AIO_ALLDONE;
2079 
2080 	return (0);
2081 }
2082 
2083 /*
2084  * aio_error is implemented at the kernel level for compatibility purposes
2085  * only.  For a user-mode async implementation, it would be best to do it
2086  * in a userland subroutine.
2087  */
2088 static int
2089 kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
2090 {
2091 	struct proc *p = td->td_proc;
2092 	struct aiocblist *cb;
2093 	struct kaioinfo *ki;
2094 	int status;
2095 
2096 	ki = p->p_aioinfo;
2097 	if (ki == NULL) {
2098 		td->td_retval[0] = EINVAL;
2099 		return (0);
2100 	}
2101 
2102 	AIO_LOCK(ki);
2103 	TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
2104 		if (cb->uuaiocb == aiocbp) {
2105 			if (cb->jobstate == JOBST_JOBFINISHED)
2106 				td->td_retval[0] =
2107 					cb->uaiocb._aiocb_private.error;
2108 			else
2109 				td->td_retval[0] = EINPROGRESS;
2110 			AIO_UNLOCK(ki);
2111 			return (0);
2112 		}
2113 	}
2114 	AIO_UNLOCK(ki);
2115 
2116 	/*
2117 	 * Hack for failure of aio_aqueue.
2118 	 */
2119 	status = ops->fetch_status(aiocbp);
2120 	if (status == -1) {
2121 		td->td_retval[0] = ops->fetch_error(aiocbp);
2122 		return (0);
2123 	}
2124 
2125 	td->td_retval[0] = EINVAL;
2126 	return (0);
2127 }
2128 
2129 int
2130 sys_aio_error(struct thread *td, struct aio_error_args *uap)
2131 {
2132 
2133 	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
2134 }
2135 
2136 /* syscall - asynchronous read from a file (REALTIME) */
2137 int
2138 sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
2139 {
2140 
2141 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2142 	    &aiocb_ops_osigevent));
2143 }
2144 
2145 int
2146 sys_aio_read(struct thread *td, struct aio_read_args *uap)
2147 {
2148 
2149 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
2150 }
2151 
2152 /* syscall - asynchronous write to a file (REALTIME) */
2153 int
2154 sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
2155 {
2156 
2157 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2158 	    &aiocb_ops_osigevent));
2159 }
2160 
2161 int
2162 sys_aio_write(struct thread *td, struct aio_write_args *uap)
2163 {
2164 
2165 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
2166 }
2167 
2168 int
2169 sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
2170 {
2171 
2172 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
2173 }
2174 
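/*
 * Common implementation of lio_listio(): validate the mode and count, set
 * up the aioliojob and any completion notification (signal or kevent),
 * queue each non-NULL request via aio_aqueue(), and for LIO_WAIT sleep
 * until every queued request has finished.
 */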
2175 static int
2176 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
2177     struct aiocb **acb_list, int nent, struct sigevent *sig,
2178     struct aiocb_ops *ops)
2179 {
2180 	struct proc *p = td->td_proc;
2181 	struct aiocb *iocb;
2182 	struct kaioinfo *ki;
2183 	struct aioliojob *lj;
2184 	struct kevent kev;
2185 	int error;
2186 	int nerror;
2187 	int i;
2188 
2189 	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
2190 		return (EINVAL);
2191 
2192 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2193 		return (EINVAL);
2194 
2195 	if (p->p_aioinfo == NULL)
2196 		aio_init_aioinfo(p);
2197 
2198 	ki = p->p_aioinfo;
2199 
2200 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
2201 	lj->lioj_flags = 0;
2202 	lj->lioj_count = 0;
2203 	lj->lioj_finished_count = 0;
2204 	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
2205 	ksiginfo_init(&lj->lioj_ksi);
2206 
2207 	/*
2208 	 * Set up the signal.
2209 	 */
2210 	if (sig && (mode == LIO_NOWAIT)) {
2211 		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
2212 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2213 			/* Assume only new style KEVENT */
2214 			kev.filter = EVFILT_LIO;
2215 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
2216 			kev.ident = (uintptr_t)uacb_list; /* something unique */
2217 			kev.data = (intptr_t)lj;
2218 			/* pass user defined sigval data */
2219 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
2220 			error = kqfd_register(
2221 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
2222 			if (error) {
2223 				uma_zfree(aiolio_zone, lj);
2224 				return (error);
2225 			}
2226 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
2227 			;
2228 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2229 			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
2230 				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2231 					uma_zfree(aiolio_zone, lj);
2232 					return EINVAL;
2233 					return (EINVAL);
2234 				lj->lioj_flags |= LIOJ_SIGNAL;
2235 		} else {
2236 			uma_zfree(aiolio_zone, lj);
2237 			return EINVAL;
2238 			return (EINVAL);
2239 	}
2240 
2241 	AIO_LOCK(ki);
2242 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2243 	/*
2244 	 * Add an extra aiocb count to prevent the lio from being freed
2245 	 * by other threads doing aio_waitcomplete() or aio_return(), and
2246 	 * to prevent the event from being sent until we have queued all
2247 	 * tasks.
2248 	 */
2249 	lj->lioj_count = 1;
2250 	AIO_UNLOCK(ki);
2251 
2252 	/*
2253 	 * Get pointers to the list of I/O requests.
2254 	 */
2255 	nerror = 0;
2256 	for (i = 0; i < nent; i++) {
2257 		iocb = acb_list[i];
2258 		if (iocb != NULL) {
2259 			error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
2260 			if (error != 0)
2261 				nerror++;
2262 		}
2263 	}
2264 
2265 	error = 0;
2266 	AIO_LOCK(ki);
2267 	if (mode == LIO_WAIT) {
2268 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
2269 			ki->kaio_flags |= KAIO_WAKEUP;
2270 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
2271 			    PRIBIO | PCATCH, "aiospn", 0);
2272 			if (error == ERESTART)
2273 				error = EINTR;
2274 			if (error)
2275 				break;
2276 		}
2277 	} else {
2278 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
2279 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2280 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
2281 				KNOTE_LOCKED(&lj->klist, 1);
2282 			}
2283 			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
2284 			    == LIOJ_SIGNAL
2285 			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2286 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
2287 				aio_sendsig(p, &lj->lioj_signal,
2288 					    &lj->lioj_ksi);
2289 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2290 			}
2291 		}
2292 	}
2293 	lj->lioj_count--;
2294 	if (lj->lioj_count == 0) {
2295 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
2296 		knlist_delete(&lj->klist, curthread, 1);
2297 		PROC_LOCK(p);
2298 		sigqueue_take(&lj->lioj_ksi);
2299 		PROC_UNLOCK(p);
2300 		AIO_UNLOCK(ki);
2301 		uma_zfree(aiolio_zone, lj);
2302 	} else
2303 		AIO_UNLOCK(ki);
2304 
2305 	if (nerror)
2306 		return (EIO);
2307 	return (error);
2308 }
2309 
2310 /* syscall - list directed I/O (REALTIME) */
2311 int
2312 sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
2313 {
2314 	struct aiocb **acb_list;
2315 	struct sigevent *sigp, sig;
2316 	struct osigevent osig;
2317 	int error, nent;
2318 
2319 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2320 		return (EINVAL);
2321 
2322 	nent = uap->nent;
2323 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2324 		return (EINVAL);
2325 
2326 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2327 		error = copyin(uap->sig, &osig, sizeof(osig));
2328 		if (error)
2329 			return (error);
2330 		error = convert_old_sigevent(&osig, &sig);
2331 		if (error)
2332 			return (error);
2333 		sigp = &sig;
2334 	} else
2335 		sigp = NULL;
2336 
2337 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2338 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2339 	if (error == 0)
2340 		error = kern_lio_listio(td, uap->mode,
2341 		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2342 		    &aiocb_ops_osigevent);
2343 	free(acb_list, M_LIO);
2344 	return (error);
2345 }
2346 
2347 /* syscall - list directed I/O (REALTIME) */
2348 int
2349 sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
2350 {
2351 	struct aiocb **acb_list;
2352 	struct sigevent *sigp, sig;
2353 	int error, nent;
2354 
2355 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2356 		return (EINVAL);
2357 
2358 	nent = uap->nent;
2359 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2360 		return (EINVAL);
2361 
2362 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2363 		error = copyin(uap->sig, &sig, sizeof(sig));
2364 		if (error)
2365 			return (error);
2366 		sigp = &sig;
2367 	} else
2368 		sigp = NULL;
2369 
2370 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2371 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2372 	if (error == 0)
2373 		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
2374 		    nent, sigp, &aiocb_ops);
2375 	free(acb_list, M_LIO);
2376 	return (error);
2377 }
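/*
 * Illustrative userland sketch (a hedged example, not kernel code) of a
 * synchronous lio_listio() call.  It assumes <aio.h> and <err.h> are
 * included, that "fd" is an open descriptor and "buf" a suitably sized
 * array; error handling is abbreviated:
 *
 *	struct aiocb acb = {
 *		.aio_fildes = fd,
 *		.aio_buf = buf,
 *		.aio_nbytes = sizeof(buf),
 *		.aio_offset = 0,
 *		.aio_lio_opcode = LIO_READ,
 *	};
 *	struct aiocb *list[1] = { &acb };
 *
 *	if (lio_listio(LIO_WAIT, list, 1, NULL) == -1)
 *		err(1, "lio_listio");
 */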
2378 
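/*
 * Completion callback for physio requests: unmap and unwire the user
 * pages, record the residual count and any bio error in the aiocb,
 * charge the block I/O to the job, and deliver the completion
 * notification before destroying the bio.
 */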
2379 static void
2380 aio_physwakeup(struct bio *bp)
2381 {
2382 	struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1;
2383 	struct proc *userp;
2384 	struct kaioinfo *ki;
2385 	int nblks;
2386 
2387 	/* Release mapping into kernel space. */
2388 	if (aiocbe->pbuf) {
2389 		pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages);
2390 		relpbuf(aiocbe->pbuf, NULL);
2391 		aiocbe->pbuf = NULL;
2392 		atomic_subtract_int(&num_buf_aio, 1);
2393 	}
2394 	vm_page_unhold_pages(aiocbe->pages, aiocbe->npages);
2395 
2396 	bp = aiocbe->bp;
2397 	aiocbe->bp = NULL;
2398 	userp = aiocbe->userproc;
2399 	ki = userp->p_aioinfo;
2400 	AIO_LOCK(ki);
2401 	aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid;
2402 	aiocbe->uaiocb._aiocb_private.error = 0;
2403 	if (bp->bio_flags & BIO_ERROR)
2404 		aiocbe->uaiocb._aiocb_private.error = bp->bio_error;
2405 	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
2406 	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
2407 		aiocbe->outputcharge += nblks;
2408 	else
2409 		aiocbe->inputcharge += nblks;
2410 	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
2411 	ki->kaio_buffer_count--;
2412 	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
2413 	AIO_UNLOCK(ki);
2414 
2415 	g_destroy_bio(bp);
2416 }
2417 
2418 /* syscall - wait for the next completion of an aio request */
2419 static int
2420 kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
2421     struct timespec *ts, struct aiocb_ops *ops)
2422 {
2423 	struct proc *p = td->td_proc;
2424 	struct timeval atv;
2425 	struct kaioinfo *ki;
2426 	struct aiocblist *cb;
2427 	struct aiocb *uuaiocb;
2428 	int error, status, timo;
2429 
2430 	ops->store_aiocb(aiocbp, NULL);
2431 
2432 	if (ts == NULL) {
2433 		timo = 0;
2434 	} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
2435 		timo = -1;
2436 	} else {
2437 		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
2438 			return (EINVAL);
2439 
2440 		TIMESPEC_TO_TIMEVAL(&atv, ts);
2441 		if (itimerfix(&atv))
2442 			return (EINVAL);
2443 		timo = tvtohz(&atv);
2444 	}
2445 
2446 	if (p->p_aioinfo == NULL)
2447 		aio_init_aioinfo(p);
2448 	ki = p->p_aioinfo;
2449 
2450 	error = 0;
2451 	cb = NULL;
2452 	AIO_LOCK(ki);
2453 	while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
2454 		if (timo == -1) {
2455 			error = EWOULDBLOCK;
2456 			break;
2457 		}
2458 		ki->kaio_flags |= KAIO_WAKEUP;
2459 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2460 		    "aiowc", timo);
2461 		if (timo && error == ERESTART)
2462 			error = EINTR;
2463 		if (error)
2464 			break;
2465 	}
2466 
2467 	if (cb != NULL) {
2468 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
2469 		uuaiocb = cb->uuaiocb;
2470 		status = cb->uaiocb._aiocb_private.status;
2471 		error = cb->uaiocb._aiocb_private.error;
2472 		td->td_retval[0] = status;
2473 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2474 			td->td_ru.ru_oublock += cb->outputcharge;
2475 			cb->outputcharge = 0;
2476 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2477 			td->td_ru.ru_inblock += cb->inputcharge;
2478 			cb->inputcharge = 0;
2479 		}
2480 		aio_free_entry(cb);
2481 		AIO_UNLOCK(ki);
2482 		ops->store_aiocb(aiocbp, uuaiocb);
2483 		ops->store_error(uuaiocb, error);
2484 		ops->store_status(uuaiocb, status);
2485 	} else
2486 		AIO_UNLOCK(ki);
2487 
2488 	return (error);
2489 }
2490 
2491 int
2492 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2493 {
2494 	struct timespec ts, *tsp;
2495 	int error;
2496 
2497 	if (uap->timeout) {
2498 		/* Get timespec struct. */
2499 		error = copyin(uap->timeout, &ts, sizeof(ts));
2500 		if (error)
2501 			return (error);
2502 		tsp = &ts;
2503 	} else
2504 		tsp = NULL;
2505 
2506 	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
2507 }
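/*
 * Illustrative userland sketch (a hedged example, not kernel code) of how
 * aio_waitcomplete() is typically driven.  It assumes <aio.h>, <err.h> and
 * <string.h> are included and that "fd" is an open descriptor; error
 * handling is abbreviated.  aio_waitcomplete() blocks until some queued
 * request finishes, stores its aiocb pointer and returns its completion
 * status (the byte count for a successful read):
 *
 *	struct aiocb acb, *donecb;
 *	char buf[512];
 *	ssize_t n;
 *
 *	memset(&acb, 0, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = sizeof(buf);
 *	acb.aio_offset = 0;
 *	if (aio_read(&acb) == -1)
 *		err(1, "aio_read");
 *	if ((n = aio_waitcomplete(&donecb, NULL)) == -1)
 *		err(1, "aio_waitcomplete");
 */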
2508 
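/*
 * Common implementation of aio_fsync(): only O_SYNC is accepted (there is
 * no O_DSYNC support yet), and the request is queued as an LIO_SYNC job.
 */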
2509 static int
2510 kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
2511     struct aiocb_ops *ops)
2512 {
2513 	struct proc *p = td->td_proc;
2514 	struct kaioinfo *ki;
2515 
2516 	if (op != O_SYNC) /* XXX lack of O_DSYNC */
2517 		return (EINVAL);
2518 	ki = p->p_aioinfo;
2519 	if (ki == NULL)
2520 		aio_init_aioinfo(p);
2521 	return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
2522 }
2523 
2524 int
2525 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
2526 {
2527 
2528 	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
2529 }
2530 
2531 /* kqueue attach function */
2532 static int
2533 filt_aioattach(struct knote *kn)
2534 {
2535 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2536 
2537 	/*
2538 	 * The aiocbe pointer must be validated before using it, so
2539 	 * registration is restricted to the kernel; the user cannot
2540 	 * set EV_FLAG1.
2541 	 */
2542 	if ((kn->kn_flags & EV_FLAG1) == 0)
2543 		return (EPERM);
2544 	kn->kn_ptr.p_aio = aiocbe;
2545 	kn->kn_flags &= ~EV_FLAG1;
2546 
2547 	knlist_add(&aiocbe->klist, kn, 0);
2548 
2549 	return (0);
2550 }
2551 
2552 /* kqueue detach function */
2553 static void
2554 filt_aiodetach(struct knote *kn)
2555 {
2556 	struct knlist *knl;
2557 
2558 	knl = &kn->kn_ptr.p_aio->klist;
2559 	knl->kl_lock(knl->kl_lockarg);
2560 	if (!knlist_empty(knl))
2561 		knlist_remove(knl, kn, 1);
2562 	knl->kl_unlock(knl->kl_lockarg);
2563 }
2564 
2565 /* kqueue filter function */
2566 /*ARGSUSED*/
2567 static int
2568 filt_aio(struct knote *kn, long hint)
2569 {
2570 	struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
2571 
2572 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2573 	if (aiocbe->jobstate != JOBST_JOBFINISHED)
2574 		return (0);
2575 	kn->kn_flags |= EV_EOF;
2576 	return (1);
2577 }
2578 
2579 /* kqueue attach function */
2580 static int
2581 filt_lioattach(struct knote *kn)
2582 {
2583 	struct aioliojob *lj = (struct aioliojob *)kn->kn_sdata;
2584 
2585 	/*
2586 	 * The aioliojob pointer must be validated before using it, so
2587 	 * registration is restricted to the kernel; the user cannot
2588 	 * set EV_FLAG1.
2589 	 */
2590 	if ((kn->kn_flags & EV_FLAG1) == 0)
2591 		return (EPERM);
2592 	kn->kn_ptr.p_lio = lj;
2593 	kn->kn_flags &= ~EV_FLAG1;
2594 
2595 	knlist_add(&lj->klist, kn, 0);
2596 
2597 	return (0);
2598 }
2599 
2600 /* kqueue detach function */
2601 static void
2602 filt_liodetach(struct knote *kn)
2603 {
2604 	struct knlist *knl;
2605 
2606 	knl = &kn->kn_ptr.p_lio->klist;
2607 	knl->kl_lock(knl->kl_lockarg);
2608 	if (!knlist_empty(knl))
2609 		knlist_remove(knl, kn, 1);
2610 	knl->kl_unlock(knl->kl_lockarg);
2611 }
2612 
2613 /* kqueue filter function */
2614 /*ARGSUSED*/
2615 static int
2616 filt_lio(struct knote *kn, long hint)
2617 {
2618 	struct aioliojob *lj = kn->kn_ptr.p_lio;
2619 
2620 	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
2621 }
2622 
2623 #ifdef COMPAT_FREEBSD32
2624 
2625 struct __aiocb_private32 {
2626 	int32_t	status;
2627 	int32_t	error;
2628 	uint32_t kernelinfo;
2629 };
2630 
2631 typedef struct oaiocb32 {
2632 	int	aio_fildes;		/* File descriptor */
2633 	uint64_t aio_offset __packed;	/* File offset for I/O */
2634 	uint32_t aio_buf;		/* I/O buffer in process space */
2635 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2636 	struct	osigevent32 aio_sigevent; /* Signal to deliver */
2637 	int	aio_lio_opcode;		/* LIO opcode */
2638 	int	aio_reqprio;		/* Request priority -- ignored */
2639 	struct	__aiocb_private32 _aiocb_private;
2640 } oaiocb32_t;
2641 
2642 typedef struct aiocb32 {
2643 	int32_t	aio_fildes;		/* File descriptor */
2644 	uint64_t aio_offset __packed;	/* File offset for I/O */
2645 	uint32_t aio_buf;		/* I/O buffer in process space */
2646 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2647 	int	__spare__[2];
2648 	uint32_t __spare2__;
2649 	int	aio_lio_opcode;		/* LIO opcode */
2650 	int	aio_reqprio;		/* Request priority -- ignored */
2651 	struct	__aiocb_private32 _aiocb_private;
2652 	struct	sigevent32 aio_sigevent;	/* Signal to deliver */
2653 } aiocb32_t;
2654 
2655 static int
2656 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
2657 {
2658 
2659 	/*
2660 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
2661 	 * supported by AIO with the old sigevent structure.
2662 	 */
2663 	CP(*osig, *nsig, sigev_notify);
2664 	switch (nsig->sigev_notify) {
2665 	case SIGEV_NONE:
2666 		break;
2667 	case SIGEV_SIGNAL:
2668 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
2669 		break;
2670 	case SIGEV_KEVENT:
2671 		nsig->sigev_notify_kqueue =
2672 		    osig->__sigev_u.__sigev_notify_kqueue;
2673 		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
2674 		break;
2675 	default:
2676 		return (EINVAL);
2677 	}
2678 	return (0);
2679 }
2680 
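/*
 * Copy in a 32-bit aiocb that uses the old osigevent layout and convert
 * it to the native struct aiocb.
 */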
2681 static int
2682 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
2683 {
2684 	struct oaiocb32 job32;
2685 	int error;
2686 
2687 	bzero(kjob, sizeof(struct aiocb));
2688 	error = copyin(ujob, &job32, sizeof(job32));
2689 	if (error)
2690 		return (error);
2691 
2692 	CP(job32, *kjob, aio_fildes);
2693 	CP(job32, *kjob, aio_offset);
2694 	PTRIN_CP(job32, *kjob, aio_buf);
2695 	CP(job32, *kjob, aio_nbytes);
2696 	CP(job32, *kjob, aio_lio_opcode);
2697 	CP(job32, *kjob, aio_reqprio);
2698 	CP(job32, *kjob, _aiocb_private.status);
2699 	CP(job32, *kjob, _aiocb_private.error);
2700 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2701 	return (convert_old_sigevent32(&job32.aio_sigevent,
2702 	    &kjob->aio_sigevent));
2703 }
2704 
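/*
 * Copy in a 32-bit aiocb with the current sigevent layout and convert it
 * to the native struct aiocb.
 */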
2705 static int
2706 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
2707 {
2708 	struct aiocb32 job32;
2709 	int error;
2710 
2711 	error = copyin(ujob, &job32, sizeof(job32));
2712 	if (error)
2713 		return (error);
2714 	CP(job32, *kjob, aio_fildes);
2715 	CP(job32, *kjob, aio_offset);
2716 	PTRIN_CP(job32, *kjob, aio_buf);
2717 	CP(job32, *kjob, aio_nbytes);
2718 	CP(job32, *kjob, aio_lio_opcode);
2719 	CP(job32, *kjob, aio_reqprio);
2720 	CP(job32, *kjob, _aiocb_private.status);
2721 	CP(job32, *kjob, _aiocb_private.error);
2722 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2723 	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
2724 }
2725 
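/*
 * 32-bit accessors for the user-visible _aiocb_private fields; these read
 * and write the fields directly in the user-supplied aiocb32.
 */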
2726 static long
2727 aiocb32_fetch_status(struct aiocb *ujob)
2728 {
2729 	struct aiocb32 *ujob32;
2730 
2731 	ujob32 = (struct aiocb32 *)ujob;
2732 	return (fuword32(&ujob32->_aiocb_private.status));
2733 }
2734 
2735 static long
2736 aiocb32_fetch_error(struct aiocb *ujob)
2737 {
2738 	struct aiocb32 *ujob32;
2739 
2740 	ujob32 = (struct aiocb32 *)ujob;
2741 	return (fuword32(&ujob32->_aiocb_private.error));
2742 }
2743 
2744 static int
2745 aiocb32_store_status(struct aiocb *ujob, long status)
2746 {
2747 	struct aiocb32 *ujob32;
2748 
2749 	ujob32 = (struct aiocb32 *)ujob;
2750 	return (suword32(&ujob32->_aiocb_private.status, status));
2751 }
2752 
2753 static int
2754 aiocb32_store_error(struct aiocb *ujob, long error)
2755 {
2756 	struct aiocb32 *ujob32;
2757 
2758 	ujob32 = (struct aiocb32 *)ujob;
2759 	return (suword32(&ujob32->_aiocb_private.error, error));
2760 }
2761 
2762 static int
2763 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
2764 {
2765 	struct aiocb32 *ujob32;
2766 
2767 	ujob32 = (struct aiocb32 *)ujob;
2768 	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
2769 }
2770 
2771 static int
2772 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
2773 {
2774 
2775 	return (suword32(ujobp, (long)ujob));
2776 }
2777 
2778 static struct aiocb_ops aiocb32_ops = {
2779 	.copyin = aiocb32_copyin,
2780 	.fetch_status = aiocb32_fetch_status,
2781 	.fetch_error = aiocb32_fetch_error,
2782 	.store_status = aiocb32_store_status,
2783 	.store_error = aiocb32_store_error,
2784 	.store_kernelinfo = aiocb32_store_kernelinfo,
2785 	.store_aiocb = aiocb32_store_aiocb,
2786 };
2787 
2788 static struct aiocb_ops aiocb32_ops_osigevent = {
2789 	.copyin = aiocb32_copyin_old_sigevent,
2790 	.fetch_status = aiocb32_fetch_status,
2791 	.fetch_error = aiocb32_fetch_error,
2792 	.store_status = aiocb32_store_status,
2793 	.store_error = aiocb32_store_error,
2794 	.store_kernelinfo = aiocb32_store_kernelinfo,
2795 	.store_aiocb = aiocb32_store_aiocb,
2796 };
2797 
2798 int
2799 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
2800 {
2801 
2802 	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2803 }
2804 
2805 int
2806 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
2807 {
2808 	struct timespec32 ts32;
2809 	struct timespec ts, *tsp;
2810 	struct aiocb **ujoblist;
2811 	uint32_t *ujoblist32;
2812 	int error, i;
2813 
2814 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
2815 		return (EINVAL);
2816 
2817 	if (uap->timeout) {
2818 		/* Get timespec struct. */
2819 		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
2820 			return (error);
2821 		CP(ts32, ts, tv_sec);
2822 		CP(ts32, ts, tv_nsec);
2823 		tsp = &ts;
2824 	} else
2825 		tsp = NULL;
2826 
2827 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
2828 	ujoblist32 = (uint32_t *)ujoblist;
2829 	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
2830 	    sizeof(ujoblist32[0]));
2831 	if (error == 0) {
2832 		for (i = uap->nent - 1; i >= 0; i--)
2833 			ujoblist[i] = PTRIN(ujoblist32[i]);
2834 
2835 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2836 	}
2837 	uma_zfree(aiol_zone, ujoblist);
2838 	return (error);
2839 }
2840 
2841 int
2842 freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
2843 {
2844 
2845 	return (sys_aio_cancel(td, (struct aio_cancel_args *)uap));
2846 }
2847 
2848 int
2849 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
2850 {
2851 
2852 	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2853 }
2854 
2855 int
2856 freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
2857 {
2858 
2859 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2860 	    &aiocb32_ops_osigevent));
2861 }
2862 
2863 int
2864 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
2865 {
2866 
2867 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2868 	    &aiocb32_ops));
2869 }
2870 
2871 int
2872 freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
2873 {
2874 
2875 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2876 	    &aiocb32_ops_osigevent));
2877 }
2878 
2879 int
2880 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
2881 {
2882 
2883 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2884 	    &aiocb32_ops));
2885 }
2886 
2887 int
2888 freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
2889 {
2890 
2891 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
2892 	    &aiocb32_ops));
2893 }
2894 
2895 int
2896 freebsd32_aio_waitcomplete(struct thread *td,
2897     struct freebsd32_aio_waitcomplete_args *uap)
2898 {
2899 	struct timespec32 ts32;
2900 	struct timespec ts, *tsp;
2901 	int error;
2902 
2903 	if (uap->timeout) {
2904 		/* Get timespec struct. */
2905 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
2906 		if (error)
2907 			return (error);
2908 		CP(ts32, ts, tv_sec);
2909 		CP(ts32, ts, tv_nsec);
2910 		tsp = &ts;
2911 	} else
2912 		tsp = NULL;
2913 
2914 	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
2915 	    &aiocb32_ops));
2916 }
2917 
2918 int
2919 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
2920 {
2921 
2922 	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
2923 	    &aiocb32_ops));
2924 }
2925 
2926 int
2927 freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
2928 {
2929 	struct aiocb **acb_list;
2930 	struct sigevent *sigp, sig;
2931 	struct osigevent32 osig;
2932 	uint32_t *acb_list32;
2933 	int error, i, nent;
2934 
2935 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2936 		return (EINVAL);
2937 
2938 	nent = uap->nent;
2939 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2940 		return (EINVAL);
2941 
2942 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2943 		error = copyin(uap->sig, &osig, sizeof(osig));
2944 		if (error)
2945 			return (error);
2946 		error = convert_old_sigevent32(&osig, &sig);
2947 		if (error)
2948 			return (error);
2949 		sigp = &sig;
2950 	} else
2951 		sigp = NULL;
2952 
2953 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2954 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
2955 	if (error) {
2956 		free(acb_list32, M_LIO);
2957 		return (error);
2958 	}
2959 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2960 	for (i = 0; i < nent; i++)
2961 		acb_list[i] = PTRIN(acb_list32[i]);
2962 	free(acb_list32, M_LIO);
2963 
2964 	error = kern_lio_listio(td, uap->mode,
2965 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2966 	    &aiocb32_ops_osigevent);
2967 	free(acb_list, M_LIO);
2968 	return (error);
2969 }
2970 
2971 int
2972 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
2973 {
2974 	struct aiocb **acb_list;
2975 	struct sigevent *sigp, sig;
2976 	struct sigevent32 sig32;
2977 	uint32_t *acb_list32;
2978 	int error, i, nent;
2979 
2980 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2981 		return (EINVAL);
2982 
2983 	nent = uap->nent;
2984 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2985 		return (EINVAL);
2986 
2987 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2988 		error = copyin(uap->sig, &sig32, sizeof(sig32));
2989 		if (error)
2990 			return (error);
2991 		error = convert_sigevent32(&sig32, &sig);
2992 		if (error)
2993 			return (error);
2994 		sigp = &sig;
2995 	} else
2996 		sigp = NULL;
2997 
2998 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2999 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
3000 	if (error) {
3001 		free(acb_list32, M_LIO);
3002 		return (error);
3003 	}
3004 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
3005 	for (i = 0; i < nent; i++)
3006 		acb_list[i] = PTRIN(acb_list32[i]);
3007 	free(acb_list32, M_LIO);
3008 
3009 	error = kern_lio_listio(td, uap->mode,
3010 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
3011 	    &aiocb32_ops);
3012 	free(acb_list, M_LIO);
3013 	return (error);
3014 }
3015 
3016 #endif
3017