xref: /freebsd/sys/kern/vfs_aio.c (revision 4f0a4502a1f33fef287ac558c98e5ef99a32216f)
1 /*-
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  */
16 
17 /*
18  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
19  */
20 
21 #include <sys/cdefs.h>
22 __FBSDID("$FreeBSD$");
23 
24 #include "opt_compat.h"
25 
26 #include <sys/param.h>
27 #include <sys/systm.h>
28 #include <sys/malloc.h>
29 #include <sys/bio.h>
30 #include <sys/buf.h>
31 #include <sys/capsicum.h>
32 #include <sys/eventhandler.h>
33 #include <sys/sysproto.h>
34 #include <sys/filedesc.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/kthread.h>
38 #include <sys/fcntl.h>
39 #include <sys/file.h>
40 #include <sys/limits.h>
41 #include <sys/lock.h>
42 #include <sys/mutex.h>
43 #include <sys/unistd.h>
44 #include <sys/posix4.h>
45 #include <sys/proc.h>
46 #include <sys/resourcevar.h>
47 #include <sys/signalvar.h>
48 #include <sys/protosw.h>
49 #include <sys/rwlock.h>
50 #include <sys/sema.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <sys/syscall.h>
54 #include <sys/sysent.h>
55 #include <sys/sysctl.h>
56 #include <sys/sx.h>
57 #include <sys/taskqueue.h>
58 #include <sys/vnode.h>
59 #include <sys/conf.h>
60 #include <sys/event.h>
61 #include <sys/mount.h>
62 #include <geom/geom.h>
63 
64 #include <machine/atomic.h>
65 
66 #include <vm/vm.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_extern.h>
69 #include <vm/pmap.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 #include <vm/uma.h>
73 #include <sys/aio.h>
74 
75 #include "opt_vfs_aio.h"
76 
77 /*
78  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
79  * overflow. (XXX will be removed soon.)
80  */
81 static u_long jobrefid;
82 
83 /*
84  * Counter for aio_fsync.
85  */
86 static uint64_t jobseqno;
87 
/*
 * Values of the jobstate member of struct aiocblist.
 */
#define	JOBST_NULL		0	/* set right before a job is freed */
#define	JOBST_JOBQSOCK		1	/* queued on a socket's so_aiojobq */
#define	JOBST_JOBQGLOBAL	2	/* queued on the global aio_jobs list */
#define	JOBST_JOBRUNNING	3	/* selected by an AIO daemon */
#define	JOBST_JOBFINISHED	4	/* completed or cancelled */
#define	JOBST_JOBQBUF		5	/* BIO backend queue (going by the name) */
#define	JOBST_JOBQSYNC		6	/* queued on the per-process sync queue */

/*
 * Compile-time defaults.  Each one may be overridden by defining the
 * macro before this point (e.g. from the kernel configuration); most of
 * them only seed a run-time sysctl variable of the same name.
 */

/* Default for vfs.aio.max_aio_per_proc. */
#ifndef MAX_AIO_PER_PROC
#define	MAX_AIO_PER_PROC	32
#endif

/* Default for vfs.aio.max_aio_queue_per_proc. */
#ifndef MAX_AIO_QUEUE_PER_PROC
#define	MAX_AIO_QUEUE_PER_PROC	256	/* Bigger than AIO_LISTIO_MAX */
#endif

/* Default for vfs.aio.max_aio_procs. */
#ifndef MAX_AIO_PROCS
#define	MAX_AIO_PROCS		32
#endif

/* Default for vfs.aio.max_aio_queue. */
#ifndef MAX_AIO_QUEUE
#define	MAX_AIO_QUEUE		1024	/* Bigger than AIO_LISTIO_MAX */
#endif

/* Default for vfs.aio.target_aio_procs. */
#ifndef TARGET_AIO_PROCS
#define	TARGET_AIO_PROCS	4
#endif

/* Default for vfs.aio.max_buf_aio. */
#ifndef MAX_BUF_AIO
#define	MAX_BUF_AIO		16
#endif

/* Default for vfs.aio.aiod_lifetime; depends on hz, so evaluated at run time. */
#ifndef AIOD_LIFETIME_DEFAULT
#define	AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif
123 
124 FEATURE(aio, "Asynchronous I/O");
125 
126 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
127 
128 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
129 
130 static int max_aio_procs = MAX_AIO_PROCS;
131 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
132 	CTLFLAG_RW, &max_aio_procs, 0,
133 	"Maximum number of kernel processes to use for handling async IO ");
134 
135 static int num_aio_procs = 0;
136 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
137 	CTLFLAG_RD, &num_aio_procs, 0,
138 	"Number of presently active kernel processes for async IO");
139 
140 /*
141  * The code will adjust the actual number of AIO processes towards this
142  * number when it gets a chance.
143  */
144 static int target_aio_procs = TARGET_AIO_PROCS;
145 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
146 	0, "Preferred number of ready kernel processes for async IO");
147 
148 static int max_queue_count = MAX_AIO_QUEUE;
149 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
150     "Maximum number of aio requests to queue, globally");
151 
152 static int num_queue_count = 0;
153 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
154     "Number of queued aio requests");
155 
156 static int num_buf_aio = 0;
157 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
158     "Number of aio requests presently handled by the buf subsystem");
159 
160 /* Number of async I/O processes in the process of being started */
161 /* XXX This should be local to aio_aqueue() */
162 static int num_aio_resv_start = 0;
163 
164 static int aiod_lifetime;
165 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
166     "Maximum lifetime for idle aiod");
167 
168 static int unloadable = 0;
169 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
170     "Allow unload of aio (not recommended)");
171 
172 
173 static int max_aio_per_proc = MAX_AIO_PER_PROC;
174 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
175     0, "Maximum active aio requests per process (stored in the process)");
176 
177 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
178 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
179     &max_aio_queue_per_proc, 0,
180     "Maximum queued aio requests per process (stored in the process)");
181 
182 static int max_buf_aio = MAX_BUF_AIO;
183 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
184     "Maximum buf aio requests per process (stored in the process)");
185 
186 typedef struct oaiocb {
187 	int	aio_fildes;		/* File descriptor */
188 	off_t	aio_offset;		/* File offset for I/O */
189 	volatile void *aio_buf;         /* I/O buffer in process space */
190 	size_t	aio_nbytes;		/* Number of bytes for I/O */
191 	struct	osigevent aio_sigevent;	/* Signal to deliver */
192 	int	aio_lio_opcode;		/* LIO opcode */
193 	int	aio_reqprio;		/* Request priority -- ignored */
194 	struct	__aiocb_private	_aiocb_private;
195 } oaiocb_t;
196 
/*
 * Below is a key of the locks used to protect each member of struct
 * aiocblist, aioliojob and kaioinfo, and of any backends.
 *
 * * - does not need to be protected
 * a - locked by the kaioinfo lock
 * b - locked by the backend lock.  The backend lock can be null in some
 *     cases; the BIO backend is one example, and in that case the proc
 *     lock is reused.
 * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
 */
208 
/*
 * Currently, there are only two backends: BIO and generic file I/O.
 * Socket I/O is served by generic file I/O.  This is not a good idea:
 * disk file I/O, and any other type of I/O without the O_NONBLOCK flag,
 * can block the daemon processes, and if no thread is left to serve
 * socket I/O, the socket I/O will be delayed too long or starved.  We
 * should create some processes dedicated to sockets to do non-blocking
 * I/O, and the same goes for pipes and fifos.  For these I/O systems we
 * really need a non-blocking interface; fiddling with O_NONBLOCK in the
 * file structure is not safe because there is a race between userland
 * and the aio daemons.
 */
220 
221 struct aiocblist {
222 	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list of for backend */
223 	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
224 	TAILQ_ENTRY(aiocblist) allist;  /* (a) list of all jobs in proc */
225 	int	jobflags;		/* (a) job flags */
226 	int	jobstate;		/* (b) job state */
227 	int	inputcharge;		/* (*) input blockes */
228 	int	outputcharge;		/* (*) output blockes */
229 	struct	bio *bp;		/* (*) BIO backend BIO pointer */
230 	struct	buf *pbuf;		/* (*) BIO backend buffer pointer */
231 	struct	vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */
232 	int	npages;			/* BIO backend number of pages */
233 	struct	proc *userproc;		/* (*) user process */
234 	struct  ucred *cred;		/* (*) active credential when created */
235 	struct	file *fd_file;		/* (*) pointer to file structure */
236 	struct	aioliojob *lio;		/* (*) optional lio job */
237 	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
238 	struct	knlist klist;		/* (a) list of knotes */
239 	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
240 	ksiginfo_t ksi;			/* (a) realtime signal info */
241 	uint64_t seqno;			/* (*) job number */
242 	int	pending;		/* (a) number of pending I/O, aio_fsync only */
243 };
244 
245 /* jobflags */
246 #define AIOCBLIST_DONE		0x01
247 #define AIOCBLIST_BUFDONE	0x02
248 #define AIOCBLIST_RUNDOWN	0x04
249 #define AIOCBLIST_CHECKSYNC	0x08
250 
251 /*
252  * AIO process info
253  */
254 #define AIOP_FREE	0x1			/* proc on free queue */
255 
256 struct aioproc {
257 	int aioprocflags;			/* (c) AIO proc flags */
258 	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
259 	struct proc *aioproc;			/* (*) the AIO proc */
260 };
261 
262 /*
263  * data-structure for lio signal management
264  */
265 struct aioliojob {
266 	int	lioj_flags;			/* (a) listio flags */
267 	int	lioj_count;			/* (a) listio flags */
268 	int	lioj_finished_count;		/* (a) listio flags */
269 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
270 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
271 	struct  knlist klist;			/* (a) list of knotes */
272 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
273 };
274 
275 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
276 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
277 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
278 
279 /*
280  * per process aio data structure
281  */
282 struct kaioinfo {
283 	struct mtx	kaio_mtx;	/* the lock to protect this struct */
284 	int	kaio_flags;		/* (a) per process kaio flags */
285 	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
286 	int	kaio_active_count;	/* (c) number of currently used AIOs */
287 	int	kaio_qallowed_count;	/* (*) maxiumu size of AIO queue */
288 	int	kaio_count;		/* (a) size of AIO queue */
289 	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
290 	int	kaio_buffer_count;	/* (a) number of physio buffers */
291 	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in the process */
292 	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
293 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
294 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
295 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue for process */
296 	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
297 	struct	task	kaio_task;	/* (*) task to kick aio processes */
298 };
299 
300 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
301 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
302 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
303 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
304 
305 #define KAIO_RUNDOWN	0x1	/* process is being run down */
306 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
307 
/*
 * Table of accessors for a userland aio control block.  Each supported
 * ABI supplies its own table so that the generic code does not need to
 * know the user-space layout.  The exact contract of each hook is set by
 * the implementations, which are not part of this section of the file.
 */
struct aiocb_ops {
	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);
	long	(*fetch_status)(struct aiocb *ujob);
	long	(*fetch_error)(struct aiocb *ujob);
	int	(*store_status)(struct aiocb *ujob, long status);
	int	(*store_error)(struct aiocb *ujob, long error);
	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
};
321 
322 static TAILQ_HEAD(,aioproc) aio_freeproc;		/* (c) Idle daemons */
323 static struct sema aio_newproc_sem;
324 static struct mtx aio_job_mtx;
325 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* (c) Async job list */
326 static struct unrhdr *aiod_unr;
327 
328 void		aio_init_aioinfo(struct proc *p);
329 static int	aio_onceonly(void);
330 static int	aio_free_entry(struct aiocblist *aiocbe);
331 static void	aio_process_rw(struct aiocblist *aiocbe);
332 static void	aio_process_sync(struct aiocblist *aiocbe);
333 static void	aio_process_mlock(struct aiocblist *aiocbe);
334 static int	aio_newproc(int *);
335 int		aio_aqueue(struct thread *td, struct aiocb *job,
336 			struct aioliojob *lio, int type, struct aiocb_ops *ops);
337 static void	aio_physwakeup(struct bio *bp);
338 static void	aio_proc_rundown(void *arg, struct proc *p);
339 static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
340 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
341 static void	aio_daemon(void *param);
342 static void	aio_swake_cb(struct socket *, struct sockbuf *);
343 static int	aio_unload(void);
344 static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
345 #define DONE_BUF	1
346 #define DONE_QUEUE	2
347 static int	aio_kick(struct proc *userp);
348 static void	aio_kick_nowait(struct proc *userp);
349 static void	aio_kick_helper(void *context, int pending);
350 static int	filt_aioattach(struct knote *kn);
351 static void	filt_aiodetach(struct knote *kn);
352 static int	filt_aio(struct knote *kn, long hint);
353 static int	filt_lioattach(struct knote *kn);
354 static void	filt_liodetach(struct knote *kn);
355 static int	filt_lio(struct knote *kn, long hint);
356 
357 /*
358  * Zones for:
359  * 	kaio	Per process async io info
360  *	aiop	async io process data
361  *	aiocb	async io jobs
362  *	aiol	list io job pointer - internal to aio_suspend XXX
363  *	aiolio	list io jobs
364  */
365 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
366 
367 /* kqueue filters for aio */
368 static struct filterops aio_filtops = {
369 	.f_isfd = 0,
370 	.f_attach = filt_aioattach,
371 	.f_detach = filt_aiodetach,
372 	.f_event = filt_aio,
373 };
374 static struct filterops lio_filtops = {
375 	.f_isfd = 0,
376 	.f_attach = filt_lioattach,
377 	.f_detach = filt_liodetach,
378 	.f_event = filt_lio
379 };
380 
381 static eventhandler_tag exit_tag, exec_tag;
382 
383 TASKQUEUE_DEFINE_THREAD(aiod_kick);
384 
385 /*
386  * Main operations function for use as a kernel module.
387  */
388 static int
389 aio_modload(struct module *module, int cmd, void *arg)
390 {
391 	int error = 0;
392 
393 	switch (cmd) {
394 	case MOD_LOAD:
395 		aio_onceonly();
396 		break;
397 	case MOD_UNLOAD:
398 		error = aio_unload();
399 		break;
400 	case MOD_SHUTDOWN:
401 		break;
402 	default:
403 		error = EINVAL;
404 		break;
405 	}
406 	return (error);
407 }
408 
409 static moduledata_t aio_mod = {
410 	"aio",
411 	&aio_modload,
412 	NULL
413 };
414 
415 static struct syscall_helper_data aio_syscalls[] = {
416 	SYSCALL_INIT_HELPER(aio_cancel),
417 	SYSCALL_INIT_HELPER(aio_error),
418 	SYSCALL_INIT_HELPER(aio_fsync),
419 	SYSCALL_INIT_HELPER(aio_mlock),
420 	SYSCALL_INIT_HELPER(aio_read),
421 	SYSCALL_INIT_HELPER(aio_return),
422 	SYSCALL_INIT_HELPER(aio_suspend),
423 	SYSCALL_INIT_HELPER(aio_waitcomplete),
424 	SYSCALL_INIT_HELPER(aio_write),
425 	SYSCALL_INIT_HELPER(lio_listio),
426 	SYSCALL_INIT_HELPER(oaio_read),
427 	SYSCALL_INIT_HELPER(oaio_write),
428 	SYSCALL_INIT_HELPER(olio_listio),
429 	SYSCALL_INIT_LAST
430 };
431 
432 #ifdef COMPAT_FREEBSD32
433 #include <sys/mount.h>
434 #include <sys/socket.h>
435 #include <compat/freebsd32/freebsd32.h>
436 #include <compat/freebsd32/freebsd32_proto.h>
437 #include <compat/freebsd32/freebsd32_signal.h>
438 #include <compat/freebsd32/freebsd32_syscall.h>
439 #include <compat/freebsd32/freebsd32_util.h>
440 
441 static struct syscall_helper_data aio32_syscalls[] = {
442 	SYSCALL32_INIT_HELPER(freebsd32_aio_return),
443 	SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
444 	SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
445 	SYSCALL32_INIT_HELPER(freebsd32_aio_error),
446 	SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
447 	SYSCALL32_INIT_HELPER(freebsd32_aio_mlock),
448 	SYSCALL32_INIT_HELPER(freebsd32_aio_read),
449 	SYSCALL32_INIT_HELPER(freebsd32_aio_write),
450 	SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
451 	SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
452 	SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
453 	SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
454 	SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
455 	SYSCALL_INIT_LAST
456 };
457 #endif
458 
459 DECLARE_MODULE(aio, aio_mod,
460 	SI_SUB_VFS, SI_ORDER_ANY);
461 MODULE_VERSION(aio, 1);
462 
463 /*
464  * Startup initialization
465  */
466 static int
467 aio_onceonly(void)
468 {
469 	int error;
470 
471 	/* XXX: should probably just use so->callback */
472 	aio_swake = &aio_swake_cb;
473 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
474 	    EVENTHANDLER_PRI_ANY);
475 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
476 	    EVENTHANDLER_PRI_ANY);
477 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
478 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
479 	TAILQ_INIT(&aio_freeproc);
480 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
481 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
482 	TAILQ_INIT(&aio_jobs);
483 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
484 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
485 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
486 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
487 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
488 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
489 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
490 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
491 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
492 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
493 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
494 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
495 	jobrefid = 1;
496 	async_io_version = _POSIX_VERSION;
497 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
498 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
499 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
500 
501 	error = syscall_helper_register(aio_syscalls, SY_THR_STATIC_KLD);
502 	if (error)
503 		return (error);
504 #ifdef COMPAT_FREEBSD32
505 	error = syscall32_helper_register(aio32_syscalls, SY_THR_STATIC_KLD);
506 	if (error)
507 		return (error);
508 #endif
509 	return (0);
510 }
511 
512 /*
513  * Callback for unload of AIO when used as a module.
514  */
515 static int
516 aio_unload(void)
517 {
518 	int error;
519 
520 	/*
521 	 * XXX: no unloads by default, it's too dangerous.
522 	 * perhaps we could do it if locked out callers and then
523 	 * did an aio_proc_rundown() on each process.
524 	 *
525 	 * jhb: aio_proc_rundown() needs to run on curproc though,
526 	 * so I don't think that would fly.
527 	 */
528 	if (!unloadable)
529 		return (EOPNOTSUPP);
530 
531 #ifdef COMPAT_FREEBSD32
532 	syscall32_helper_unregister(aio32_syscalls);
533 #endif
534 	syscall_helper_unregister(aio_syscalls);
535 
536 	error = kqueue_del_filteropts(EVFILT_AIO);
537 	if (error)
538 		return error;
539 	error = kqueue_del_filteropts(EVFILT_LIO);
540 	if (error)
541 		return error;
542 	async_io_version = 0;
543 	aio_swake = NULL;
544 	taskqueue_free(taskqueue_aiod_kick);
545 	delete_unrhdr(aiod_unr);
546 	uma_zdestroy(kaio_zone);
547 	uma_zdestroy(aiop_zone);
548 	uma_zdestroy(aiocb_zone);
549 	uma_zdestroy(aiol_zone);
550 	uma_zdestroy(aiolio_zone);
551 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
552 	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
553 	mtx_destroy(&aio_job_mtx);
554 	sema_destroy(&aio_newproc_sem);
555 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
556 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
557 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
558 	return (0);
559 }
560 
561 /*
562  * Init the per-process aioinfo structure.  The aioinfo limits are set
563  * per-process for user limit (resource) management.
564  */
565 void
566 aio_init_aioinfo(struct proc *p)
567 {
568 	struct kaioinfo *ki;
569 
570 	ki = uma_zalloc(kaio_zone, M_WAITOK);
571 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
572 	ki->kaio_flags = 0;
573 	ki->kaio_maxactive_count = max_aio_per_proc;
574 	ki->kaio_active_count = 0;
575 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
576 	ki->kaio_count = 0;
577 	ki->kaio_ballowed_count = max_buf_aio;
578 	ki->kaio_buffer_count = 0;
579 	TAILQ_INIT(&ki->kaio_all);
580 	TAILQ_INIT(&ki->kaio_done);
581 	TAILQ_INIT(&ki->kaio_jobqueue);
582 	TAILQ_INIT(&ki->kaio_bufqueue);
583 	TAILQ_INIT(&ki->kaio_liojoblist);
584 	TAILQ_INIT(&ki->kaio_syncqueue);
585 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
586 	PROC_LOCK(p);
587 	if (p->p_aioinfo == NULL) {
588 		p->p_aioinfo = ki;
589 		PROC_UNLOCK(p);
590 	} else {
591 		PROC_UNLOCK(p);
592 		mtx_destroy(&ki->kaio_mtx);
593 		uma_zfree(kaio_zone, ki);
594 	}
595 
596 	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
597 		aio_newproc(NULL);
598 }
599 
600 static int
601 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
602 {
603 	struct thread *td;
604 	int error;
605 
606 	error = sigev_findtd(p, sigev, &td);
607 	if (error)
608 		return (error);
609 	if (!KSI_ONQ(ksi)) {
610 		ksiginfo_set_sigev(ksi, sigev);
611 		ksi->ksi_code = SI_ASYNCIO;
612 		ksi->ksi_flags |= KSI_EXT | KSI_INS;
613 		tdsendsignal(p, td, ksi->ksi_signo, ksi);
614 	}
615 	PROC_UNLOCK(p);
616 	return (error);
617 }
618 
619 /*
620  * Free a job entry.  Wait for completion if it is currently active, but don't
621  * delay forever.  If we delay, we return a flag that says that we have to
622  * restart the queue scan.
623  */
624 static int
625 aio_free_entry(struct aiocblist *aiocbe)
626 {
627 	struct kaioinfo *ki;
628 	struct aioliojob *lj;
629 	struct proc *p;
630 
631 	p = aiocbe->userproc;
632 	MPASS(curproc == p);
633 	ki = p->p_aioinfo;
634 	MPASS(ki != NULL);
635 
636 	AIO_LOCK_ASSERT(ki, MA_OWNED);
637 	MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
638 
639 	atomic_subtract_int(&num_queue_count, 1);
640 
641 	ki->kaio_count--;
642 	MPASS(ki->kaio_count >= 0);
643 
644 	TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
645 	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
646 
647 	lj = aiocbe->lio;
648 	if (lj) {
649 		lj->lioj_count--;
650 		lj->lioj_finished_count--;
651 
652 		if (lj->lioj_count == 0) {
653 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
654 			/* lio is going away, we need to destroy any knotes */
655 			knlist_delete(&lj->klist, curthread, 1);
656 			PROC_LOCK(p);
657 			sigqueue_take(&lj->lioj_ksi);
658 			PROC_UNLOCK(p);
659 			uma_zfree(aiolio_zone, lj);
660 		}
661 	}
662 
663 	/* aiocbe is going away, we need to destroy any knotes */
664 	knlist_delete(&aiocbe->klist, curthread, 1);
665 	PROC_LOCK(p);
666 	sigqueue_take(&aiocbe->ksi);
667 	PROC_UNLOCK(p);
668 
669 	MPASS(aiocbe->bp == NULL);
670 	aiocbe->jobstate = JOBST_NULL;
671 	AIO_UNLOCK(ki);
672 
673 	/*
674 	 * The thread argument here is used to find the owning process
675 	 * and is also passed to fo_close() which may pass it to various
676 	 * places such as devsw close() routines.  Because of that, we
677 	 * need a thread pointer from the process owning the job that is
678 	 * persistent and won't disappear out from under us or move to
679 	 * another process.
680 	 *
681 	 * Currently, all the callers of this function call it to remove
682 	 * an aiocblist from the current process' job list either via a
683 	 * syscall or due to the current process calling exit() or
684 	 * execve().  Thus, we know that p == curproc.  We also know that
685 	 * curthread can't exit since we are curthread.
686 	 *
687 	 * Therefore, we use curthread as the thread to pass to
688 	 * knlist_delete().  This does mean that it is possible for the
689 	 * thread pointer at close time to differ from the thread pointer
690 	 * at open time, but this is already true of file descriptors in
691 	 * a multithreaded process.
692 	 */
693 	if (aiocbe->fd_file)
694 		fdrop(aiocbe->fd_file, curthread);
695 	crfree(aiocbe->cred);
696 	uma_zfree(aiocb_zone, aiocbe);
697 	AIO_LOCK(ki);
698 
699 	return (0);
700 }
701 
702 static void
703 aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
704 {
705    	aio_proc_rundown(arg, p);
706 }
707 
708 /*
709  * Rundown the jobs for a given process.
710  */
711 static void
712 aio_proc_rundown(void *arg, struct proc *p)
713 {
714 	struct kaioinfo *ki;
715 	struct aioliojob *lj;
716 	struct aiocblist *cbe, *cbn;
717 	struct file *fp;
718 	struct socket *so;
719 	int remove;
720 
721 	KASSERT(curthread->td_proc == p,
722 	    ("%s: called on non-curproc", __func__));
723 	ki = p->p_aioinfo;
724 	if (ki == NULL)
725 		return;
726 
727 	AIO_LOCK(ki);
728 	ki->kaio_flags |= KAIO_RUNDOWN;
729 
730 restart:
731 
732 	/*
733 	 * Try to cancel all pending requests. This code simulates
734 	 * aio_cancel on all pending I/O requests.
735 	 */
736 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
737 		remove = 0;
738 		mtx_lock(&aio_job_mtx);
739 		if (cbe->jobstate == JOBST_JOBQGLOBAL) {
740 			TAILQ_REMOVE(&aio_jobs, cbe, list);
741 			remove = 1;
742 		} else if (cbe->jobstate == JOBST_JOBQSOCK) {
743 			fp = cbe->fd_file;
744 			MPASS(fp->f_type == DTYPE_SOCKET);
745 			so = fp->f_data;
746 			TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
747 			remove = 1;
748 		} else if (cbe->jobstate == JOBST_JOBQSYNC) {
749 			TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
750 			remove = 1;
751 		}
752 		mtx_unlock(&aio_job_mtx);
753 
754 		if (remove) {
755 			cbe->jobstate = JOBST_JOBFINISHED;
756 			cbe->uaiocb._aiocb_private.status = -1;
757 			cbe->uaiocb._aiocb_private.error = ECANCELED;
758 			TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
759 			aio_bio_done_notify(p, cbe, DONE_QUEUE);
760 		}
761 	}
762 
763 	/* Wait for all running I/O to be finished */
764 	if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
765 	    TAILQ_FIRST(&ki->kaio_jobqueue)) {
766 		ki->kaio_flags |= KAIO_WAKEUP;
767 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
768 		goto restart;
769 	}
770 
771 	/* Free all completed I/O requests. */
772 	while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
773 		aio_free_entry(cbe);
774 
775 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
776 		if (lj->lioj_count == 0) {
777 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
778 			knlist_delete(&lj->klist, curthread, 1);
779 			PROC_LOCK(p);
780 			sigqueue_take(&lj->lioj_ksi);
781 			PROC_UNLOCK(p);
782 			uma_zfree(aiolio_zone, lj);
783 		} else {
784 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
785 			    lj->lioj_count, lj->lioj_finished_count);
786 		}
787 	}
788 	AIO_UNLOCK(ki);
789 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
790 	mtx_destroy(&ki->kaio_mtx);
791 	uma_zfree(kaio_zone, ki);
792 	p->p_aioinfo = NULL;
793 }
794 
795 /*
796  * Select a job to run (called by an AIO daemon).
797  */
798 static struct aiocblist *
799 aio_selectjob(struct aioproc *aiop)
800 {
801 	struct aiocblist *aiocbe;
802 	struct kaioinfo *ki;
803 	struct proc *userp;
804 
805 	mtx_assert(&aio_job_mtx, MA_OWNED);
806 	TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
807 		userp = aiocbe->userproc;
808 		ki = userp->p_aioinfo;
809 
810 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
811 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
812 			/* Account for currently active jobs. */
813 			ki->kaio_active_count++;
814 			aiocbe->jobstate = JOBST_JOBRUNNING;
815 			break;
816 		}
817 	}
818 	return (aiocbe);
819 }
820 
821 /*
822  *  Move all data to a permanent storage device, this code
823  *  simulates fsync syscall.
824  */
825 static int
826 aio_fsync_vnode(struct thread *td, struct vnode *vp)
827 {
828 	struct mount *mp;
829 	int error;
830 
831 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
832 		goto drop;
833 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
834 	if (vp->v_object != NULL) {
835 		VM_OBJECT_WLOCK(vp->v_object);
836 		vm_object_page_clean(vp->v_object, 0, 0, 0);
837 		VM_OBJECT_WUNLOCK(vp->v_object);
838 	}
839 	error = VOP_FSYNC(vp, MNT_WAIT, td);
840 
841 	VOP_UNLOCK(vp, 0);
842 	vn_finished_write(mp);
843 drop:
844 	return (error);
845 }
846 
847 /*
848  * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
849  * does the I/O request for the non-physio version of the operations.  The
850  * normal vn operations are used, and this code should work in all instances
851  * for every type of file, including pipes, sockets, fifos, and regular files.
852  *
853  * XXX I don't think it works well for socket, pipe, and fifo.
854  */
855 static void
856 aio_process_rw(struct aiocblist *aiocbe)
857 {
858 	struct ucred *td_savedcred;
859 	struct thread *td;
860 	struct aiocb *cb;
861 	struct file *fp;
862 	struct socket *so;
863 	struct uio auio;
864 	struct iovec aiov;
865 	int cnt;
866 	int error;
867 	int oublock_st, oublock_end;
868 	int inblock_st, inblock_end;
869 
870 	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_READ ||
871 	    aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE,
872 	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
873 
874 	td = curthread;
875 	td_savedcred = td->td_ucred;
876 	td->td_ucred = aiocbe->cred;
877 	cb = &aiocbe->uaiocb;
878 	fp = aiocbe->fd_file;
879 
880 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
881 	aiov.iov_len = cb->aio_nbytes;
882 
883 	auio.uio_iov = &aiov;
884 	auio.uio_iovcnt = 1;
885 	auio.uio_offset = cb->aio_offset;
886 	auio.uio_resid = cb->aio_nbytes;
887 	cnt = cb->aio_nbytes;
888 	auio.uio_segflg = UIO_USERSPACE;
889 	auio.uio_td = td;
890 
891 	inblock_st = td->td_ru.ru_inblock;
892 	oublock_st = td->td_ru.ru_oublock;
893 	/*
894 	 * aio_aqueue() acquires a reference to the file that is
895 	 * released in aio_free_entry().
896 	 */
897 	if (cb->aio_lio_opcode == LIO_READ) {
898 		auio.uio_rw = UIO_READ;
899 		if (auio.uio_resid == 0)
900 			error = 0;
901 		else
902 			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
903 	} else {
904 		if (fp->f_type == DTYPE_VNODE)
905 			bwillwrite();
906 		auio.uio_rw = UIO_WRITE;
907 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
908 	}
909 	inblock_end = td->td_ru.ru_inblock;
910 	oublock_end = td->td_ru.ru_oublock;
911 
912 	aiocbe->inputcharge = inblock_end - inblock_st;
913 	aiocbe->outputcharge = oublock_end - oublock_st;
914 
915 	if ((error) && (auio.uio_resid != cnt)) {
916 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
917 			error = 0;
918 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
919 			int sigpipe = 1;
920 			if (fp->f_type == DTYPE_SOCKET) {
921 				so = fp->f_data;
922 				if (so->so_options & SO_NOSIGPIPE)
923 					sigpipe = 0;
924 			}
925 			if (sigpipe) {
926 				PROC_LOCK(aiocbe->userproc);
927 				kern_psignal(aiocbe->userproc, SIGPIPE);
928 				PROC_UNLOCK(aiocbe->userproc);
929 			}
930 		}
931 	}
932 
933 	cnt -= auio.uio_resid;
934 	cb->_aiocb_private.error = error;
935 	cb->_aiocb_private.status = cnt;
936 	td->td_ucred = td_savedcred;
937 }
938 
939 static void
940 aio_process_sync(struct aiocblist *aiocbe)
941 {
942 	struct thread *td = curthread;
943 	struct ucred *td_savedcred = td->td_ucred;
944 	struct aiocb *cb = &aiocbe->uaiocb;
945 	struct file *fp = aiocbe->fd_file;
946 	int error = 0;
947 
948 	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_SYNC,
949 	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
950 
951 	td->td_ucred = aiocbe->cred;
952 	if (fp->f_vnode != NULL)
953 		error = aio_fsync_vnode(td, fp->f_vnode);
954 	cb->_aiocb_private.error = error;
955 	cb->_aiocb_private.status = 0;
956 	td->td_ucred = td_savedcred;
957 }
958 
959 static void
960 aio_process_mlock(struct aiocblist *aiocbe)
961 {
962 	struct aiocb *cb = &aiocbe->uaiocb;
963 	int error;
964 
965 	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_MLOCK,
966 	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
967 
968 	error = vm_mlock(aiocbe->userproc, aiocbe->cred,
969 	    __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
970 	cb->_aiocb_private.error = error;
971 	cb->_aiocb_private.status = 0;
972 }
973 
974 static void
975 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
976 {
977 	struct aioliojob *lj;
978 	struct kaioinfo *ki;
979 	struct aiocblist *scb, *scbn;
980 	int lj_done;
981 
982 	ki = userp->p_aioinfo;
983 	AIO_LOCK_ASSERT(ki, MA_OWNED);
984 	lj = aiocbe->lio;
985 	lj_done = 0;
986 	if (lj) {
987 		lj->lioj_finished_count++;
988 		if (lj->lioj_count == lj->lioj_finished_count)
989 			lj_done = 1;
990 	}
991 	if (type == DONE_QUEUE) {
992 		aiocbe->jobflags |= AIOCBLIST_DONE;
993 	} else {
994 		aiocbe->jobflags |= AIOCBLIST_BUFDONE;
995 	}
996 	TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
997 	aiocbe->jobstate = JOBST_JOBFINISHED;
998 
999 	if (ki->kaio_flags & KAIO_RUNDOWN)
1000 		goto notification_done;
1001 
1002 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1003 	    aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
1004 		aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
1005 
1006 	KNOTE_LOCKED(&aiocbe->klist, 1);
1007 
1008 	if (lj_done) {
1009 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
1010 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
1011 			KNOTE_LOCKED(&lj->klist, 1);
1012 		}
1013 		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
1014 		    == LIOJ_SIGNAL
1015 		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
1016 		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
1017 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
1018 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1019 		}
1020 	}
1021 
1022 notification_done:
1023 	if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
1024 		TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
1025 			if (aiocbe->fd_file == scb->fd_file &&
1026 			    aiocbe->seqno < scb->seqno) {
1027 				if (--scb->pending == 0) {
1028 					mtx_lock(&aio_job_mtx);
1029 					scb->jobstate = JOBST_JOBQGLOBAL;
1030 					TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
1031 					TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
1032 					aio_kick_nowait(userp);
1033 					mtx_unlock(&aio_job_mtx);
1034 				}
1035 			}
1036 		}
1037 	}
1038 	if (ki->kaio_flags & KAIO_WAKEUP) {
1039 		ki->kaio_flags &= ~KAIO_WAKEUP;
1040 		wakeup(&userp->p_aioinfo);
1041 	}
1042 }
1043 
1044 static void
1045 aio_switch_vmspace(struct aiocblist *aiocbe)
1046 {
1047 
1048 	vmspace_switch_aio(aiocbe->userproc->p_vmspace);
1049 }
1050 
1051 /*
1052  * The AIO daemon, most of the actual work is done in aio_process_*,
1053  * but the setup (and address space mgmt) is done in this routine.
1054  */
1055 static void
1056 aio_daemon(void *_id)
1057 {
1058 	struct aiocblist *aiocbe;
1059 	struct aioproc *aiop;
1060 	struct kaioinfo *ki;
1061 	struct proc *p, *userp;
1062 	struct vmspace *myvm;
1063 	struct thread *td = curthread;
1064 	int id = (intptr_t)_id;
1065 
1066 	/*
1067 	 * Grab an extra reference on the daemon's vmspace so that it
1068 	 * doesn't get freed by jobs that switch to a different
1069 	 * vmspace.
1070 	 */
1071 	p = td->td_proc;
1072 	myvm = vmspace_acquire_ref(p);
1073 
1074 	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
1075 
1076 	/*
1077 	 * Allocate and ready the aio control info.  There is one aiop structure
1078 	 * per daemon.
1079 	 */
1080 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
1081 	aiop->aioproc = p;
1082 	aiop->aioprocflags = 0;
1083 
1084 	/*
1085 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
1086 	 * and creating too many daemons.)
1087 	 */
1088 	sema_post(&aio_newproc_sem);
1089 
1090 	mtx_lock(&aio_job_mtx);
1091 	for (;;) {
1092 		/*
1093 		 * Take daemon off of free queue
1094 		 */
1095 		if (aiop->aioprocflags & AIOP_FREE) {
1096 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1097 			aiop->aioprocflags &= ~AIOP_FREE;
1098 		}
1099 
1100 		/*
1101 		 * Check for jobs.
1102 		 */
1103 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
1104 			mtx_unlock(&aio_job_mtx);
1105 			userp = aiocbe->userproc;
1106 
1107 			/*
1108 			 * Connect to process address space for user program.
1109 			 */
1110 			aio_switch_vmspace(aiocbe);
1111 
1112 			ki = userp->p_aioinfo;
1113 
1114 			/* Do the I/O function. */
1115 			switch(aiocbe->uaiocb.aio_lio_opcode) {
1116 			case LIO_READ:
1117 			case LIO_WRITE:
1118 				aio_process_rw(aiocbe);
1119 				break;
1120 			case LIO_SYNC:
1121 				aio_process_sync(aiocbe);
1122 				break;
1123 			case LIO_MLOCK:
1124 				aio_process_mlock(aiocbe);
1125 				break;
1126 			}
1127 
1128 			mtx_lock(&aio_job_mtx);
1129 			/* Decrement the active job count. */
1130 			ki->kaio_active_count--;
1131 			mtx_unlock(&aio_job_mtx);
1132 
1133 			AIO_LOCK(ki);
1134 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
1135 			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
1136 			AIO_UNLOCK(ki);
1137 
1138 			mtx_lock(&aio_job_mtx);
1139 		}
1140 
1141 		/*
1142 		 * Disconnect from user address space.
1143 		 */
1144 		if (p->p_vmspace != myvm) {
1145 			mtx_unlock(&aio_job_mtx);
1146 			vmspace_switch_aio(myvm);
1147 			mtx_lock(&aio_job_mtx);
1148 			/*
1149 			 * We have to restart to avoid race, we only sleep if
1150 			 * no job can be selected.
1151 			 */
1152 			continue;
1153 		}
1154 
1155 		mtx_assert(&aio_job_mtx, MA_OWNED);
1156 
1157 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
1158 		aiop->aioprocflags |= AIOP_FREE;
1159 
1160 		/*
1161 		 * If daemon is inactive for a long time, allow it to exit,
1162 		 * thereby freeing resources.
1163 		 */
1164 		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
1165 		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
1166 		    (aiop->aioprocflags & AIOP_FREE) &&
1167 		    num_aio_procs > target_aio_procs)
1168 			break;
1169 	}
1170 	TAILQ_REMOVE(&aio_freeproc, aiop, list);
1171 	num_aio_procs--;
1172 	mtx_unlock(&aio_job_mtx);
1173 	uma_zfree(aiop_zone, aiop);
1174 	free_unr(aiod_unr, id);
1175 	vmspace_free(myvm);
1176 
1177 	KASSERT(p->p_vmspace == myvm,
1178 	    ("AIOD: bad vmspace for exiting daemon"));
1179 	KASSERT(myvm->vm_refcnt > 1,
1180 	    ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
1181 	kproc_exit(0);
1182 }
1183 
1184 /*
1185  * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
1186  * AIO daemon modifies its environment itself.
1187  */
1188 static int
1189 aio_newproc(int *start)
1190 {
1191 	int error;
1192 	struct proc *p;
1193 	int id;
1194 
1195 	id = alloc_unr(aiod_unr);
1196 	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
1197 		RFNOWAIT, 0, "aiod%d", id);
1198 	if (error == 0) {
1199 		/*
1200 		 * Wait until daemon is started.
1201 		 */
1202 		sema_wait(&aio_newproc_sem);
1203 		mtx_lock(&aio_job_mtx);
1204 		num_aio_procs++;
1205 		if (start != NULL)
1206 			(*start)--;
1207 		mtx_unlock(&aio_job_mtx);
1208 	} else {
1209 		free_unr(aiod_unr, id);
1210 	}
1211 	return (error);
1212 }
1213 
1214 /*
1215  * Try the high-performance, low-overhead physio method for eligible
1216  * VCHR devices.  This method doesn't use an aio helper thread, and
1217  * thus has very low overhead.
1218  *
1219  * Assumes that the caller, aio_aqueue(), has incremented the file
1220  * structure's reference count, preventing its deallocation for the
1221  * duration of this call.
1222  */
1223 static int
1224 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1225 {
1226 	struct aiocb *cb;
1227 	struct file *fp;
1228 	struct bio *bp;
1229 	struct buf *pbuf;
1230 	struct vnode *vp;
1231 	struct cdevsw *csw;
1232 	struct cdev *dev;
1233 	struct kaioinfo *ki;
1234 	struct aioliojob *lj;
1235 	int error, ref, unmap, poff;
1236 	vm_prot_t prot;
1237 
1238 	cb = &aiocbe->uaiocb;
1239 	fp = aiocbe->fd_file;
1240 
1241 	if (fp == NULL || fp->f_type != DTYPE_VNODE)
1242 		return (-1);
1243 
1244 	vp = fp->f_vnode;
1245 	if (vp->v_type != VCHR)
1246 		return (-1);
1247 	if (vp->v_bufobj.bo_bsize == 0)
1248 		return (-1);
1249 	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
1250 		return (-1);
1251 
1252 	ref = 0;
1253 	csw = devvn_refthread(vp, &dev, &ref);
1254 	if (csw == NULL)
1255 		return (ENXIO);
1256 
1257 	if ((csw->d_flags & D_DISK) == 0) {
1258 		error = -1;
1259 		goto unref;
1260 	}
1261 	if (cb->aio_nbytes > dev->si_iosize_max) {
1262 		error = -1;
1263 		goto unref;
1264 	}
1265 
1266 	ki = p->p_aioinfo;
1267 	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
1268 	unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed);
1269 	if (unmap) {
1270 		if (cb->aio_nbytes > MAXPHYS) {
1271 			error = -1;
1272 			goto unref;
1273 		}
1274 	} else {
1275 		if (cb->aio_nbytes > MAXPHYS - poff) {
1276 			error = -1;
1277 			goto unref;
1278 		}
1279 		if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
1280 			error = -1;
1281 			goto unref;
1282 		}
1283 	}
1284 	aiocbe->bp = bp = g_alloc_bio();
1285 	if (!unmap) {
1286 		aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL);
1287 		BUF_KERNPROC(pbuf);
1288 	}
1289 
1290 	AIO_LOCK(ki);
1291 	ki->kaio_count++;
1292 	if (!unmap)
1293 		ki->kaio_buffer_count++;
1294 	lj = aiocbe->lio;
1295 	if (lj)
1296 		lj->lioj_count++;
1297 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1298 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1299 	aiocbe->jobstate = JOBST_JOBQBUF;
1300 	cb->_aiocb_private.status = cb->aio_nbytes;
1301 	AIO_UNLOCK(ki);
1302 
1303 	bp->bio_length = cb->aio_nbytes;
1304 	bp->bio_bcount = cb->aio_nbytes;
1305 	bp->bio_done = aio_physwakeup;
1306 	bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
1307 	bp->bio_offset = cb->aio_offset;
1308 	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
1309 	bp->bio_dev = dev;
1310 	bp->bio_caller1 = (void *)aiocbe;
1311 
1312 	prot = VM_PROT_READ;
1313 	if (cb->aio_lio_opcode == LIO_READ)
1314 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
1315 	if ((aiocbe->npages = vm_fault_quick_hold_pages(
1316 	    &curproc->p_vmspace->vm_map,
1317 	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages,
1318 	    sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) {
1319 		error = EFAULT;
1320 		goto doerror;
1321 	}
1322 	if (!unmap) {
1323 		pmap_qenter((vm_offset_t)pbuf->b_data,
1324 		    aiocbe->pages, aiocbe->npages);
1325 		bp->bio_data = pbuf->b_data + poff;
1326 	} else {
1327 		bp->bio_ma = aiocbe->pages;
1328 		bp->bio_ma_n = aiocbe->npages;
1329 		bp->bio_ma_offset = poff;
1330 		bp->bio_data = unmapped_buf;
1331 		bp->bio_flags |= BIO_UNMAPPED;
1332 	}
1333 
1334 	atomic_add_int(&num_queue_count, 1);
1335 	if (!unmap)
1336 		atomic_add_int(&num_buf_aio, 1);
1337 
1338 	/* Perform transfer. */
1339 	csw->d_strategy(bp);
1340 	dev_relthread(dev, ref);
1341 	return (0);
1342 
1343 doerror:
1344 	AIO_LOCK(ki);
1345 	aiocbe->jobstate = JOBST_NULL;
1346 	TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1347 	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
1348 	ki->kaio_count--;
1349 	if (!unmap)
1350 		ki->kaio_buffer_count--;
1351 	if (lj)
1352 		lj->lioj_count--;
1353 	AIO_UNLOCK(ki);
1354 	if (pbuf) {
1355 		relpbuf(pbuf, NULL);
1356 		aiocbe->pbuf = NULL;
1357 	}
1358 	g_destroy_bio(bp);
1359 	aiocbe->bp = NULL;
1360 unref:
1361 	dev_relthread(dev, ref);
1362 	return (error);
1363 }
1364 
1365 /*
1366  * Wake up aio requests that may be serviceable now.
1367  */
1368 static void
1369 aio_swake_cb(struct socket *so, struct sockbuf *sb)
1370 {
1371 	struct aiocblist *cb, *cbn;
1372 	int opcode;
1373 
1374 	SOCKBUF_LOCK_ASSERT(sb);
1375 	if (sb == &so->so_snd)
1376 		opcode = LIO_WRITE;
1377 	else
1378 		opcode = LIO_READ;
1379 
1380 	sb->sb_flags &= ~SB_AIO;
1381 	mtx_lock(&aio_job_mtx);
1382 	TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
1383 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1384 			if (cb->jobstate != JOBST_JOBQSOCK)
1385 				panic("invalid queue value");
1386 			/* XXX
1387 			 * We don't have actual sockets backend yet,
1388 			 * so we simply move the requests to the generic
1389 			 * file I/O backend.
1390 			 */
1391 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1392 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1393 			aio_kick_nowait(cb->userproc);
1394 		}
1395 	}
1396 	mtx_unlock(&aio_job_mtx);
1397 }
1398 
1399 static int
1400 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
1401 {
1402 
1403 	/*
1404 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
1405 	 * supported by AIO with the old sigevent structure.
1406 	 */
1407 	nsig->sigev_notify = osig->sigev_notify;
1408 	switch (nsig->sigev_notify) {
1409 	case SIGEV_NONE:
1410 		break;
1411 	case SIGEV_SIGNAL:
1412 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
1413 		break;
1414 	case SIGEV_KEVENT:
1415 		nsig->sigev_notify_kqueue =
1416 		    osig->__sigev_u.__sigev_notify_kqueue;
1417 		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
1418 		break;
1419 	default:
1420 		return (EINVAL);
1421 	}
1422 	return (0);
1423 }
1424 
1425 static int
1426 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
1427 {
1428 	struct oaiocb *ojob;
1429 	int error;
1430 
1431 	bzero(kjob, sizeof(struct aiocb));
1432 	error = copyin(ujob, kjob, sizeof(struct oaiocb));
1433 	if (error)
1434 		return (error);
1435 	ojob = (struct oaiocb *)kjob;
1436 	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
1437 }
1438 
1439 static int
1440 aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
1441 {
1442 
1443 	return (copyin(ujob, kjob, sizeof(struct aiocb)));
1444 }
1445 
1446 static long
1447 aiocb_fetch_status(struct aiocb *ujob)
1448 {
1449 
1450 	return (fuword(&ujob->_aiocb_private.status));
1451 }
1452 
1453 static long
1454 aiocb_fetch_error(struct aiocb *ujob)
1455 {
1456 
1457 	return (fuword(&ujob->_aiocb_private.error));
1458 }
1459 
1460 static int
1461 aiocb_store_status(struct aiocb *ujob, long status)
1462 {
1463 
1464 	return (suword(&ujob->_aiocb_private.status, status));
1465 }
1466 
1467 static int
1468 aiocb_store_error(struct aiocb *ujob, long error)
1469 {
1470 
1471 	return (suword(&ujob->_aiocb_private.error, error));
1472 }
1473 
1474 static int
1475 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
1476 {
1477 
1478 	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
1479 }
1480 
/* Store the user address 'ujob' into the user-space pointer slot 'ujobp'. */
static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
	long word;

	word = (long)ujob;
	return (suword(ujobp, word));
}
1487 
1488 static struct aiocb_ops aiocb_ops = {
1489 	.copyin = aiocb_copyin,
1490 	.fetch_status = aiocb_fetch_status,
1491 	.fetch_error = aiocb_fetch_error,
1492 	.store_status = aiocb_store_status,
1493 	.store_error = aiocb_store_error,
1494 	.store_kernelinfo = aiocb_store_kernelinfo,
1495 	.store_aiocb = aiocb_store_aiocb,
1496 };
1497 
1498 static struct aiocb_ops aiocb_ops_osigevent = {
1499 	.copyin = aiocb_copyin_old_sigevent,
1500 	.fetch_status = aiocb_fetch_status,
1501 	.fetch_error = aiocb_fetch_error,
1502 	.store_status = aiocb_store_status,
1503 	.store_error = aiocb_store_error,
1504 	.store_kernelinfo = aiocb_store_kernelinfo,
1505 	.store_aiocb = aiocb_store_aiocb,
1506 };
1507 
1508 /*
1509  * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
1510  * technique is done in this code.
1511  */
1512 int
1513 aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
1514 	int type, struct aiocb_ops *ops)
1515 {
1516 	struct proc *p = td->td_proc;
1517 	cap_rights_t rights;
1518 	struct file *fp;
1519 	struct socket *so;
1520 	struct aiocblist *aiocbe, *cb;
1521 	struct kaioinfo *ki;
1522 	struct kevent kev;
1523 	struct sockbuf *sb;
1524 	int opcode;
1525 	int error;
1526 	int fd, kqfd;
1527 	int jid;
1528 	u_short evflags;
1529 
1530 	if (p->p_aioinfo == NULL)
1531 		aio_init_aioinfo(p);
1532 
1533 	ki = p->p_aioinfo;
1534 
1535 	ops->store_status(job, -1);
1536 	ops->store_error(job, 0);
1537 	ops->store_kernelinfo(job, -1);
1538 
1539 	if (num_queue_count >= max_queue_count ||
1540 	    ki->kaio_count >= ki->kaio_qallowed_count) {
1541 		ops->store_error(job, EAGAIN);
1542 		return (EAGAIN);
1543 	}
1544 
1545 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
1546 	knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
1547 
1548 	error = ops->copyin(job, &aiocbe->uaiocb);
1549 	if (error) {
1550 		ops->store_error(job, error);
1551 		uma_zfree(aiocb_zone, aiocbe);
1552 		return (error);
1553 	}
1554 
1555 	/* XXX: aio_nbytes is later casted to signed types. */
1556 	if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
1557 		uma_zfree(aiocb_zone, aiocbe);
1558 		return (EINVAL);
1559 	}
1560 
1561 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
1562 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
1563 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
1564 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
1565 		ops->store_error(job, EINVAL);
1566 		uma_zfree(aiocb_zone, aiocbe);
1567 		return (EINVAL);
1568 	}
1569 
1570 	if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1571 	     aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
1572 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1573 		uma_zfree(aiocb_zone, aiocbe);
1574 		return (EINVAL);
1575 	}
1576 
1577 	ksiginfo_init(&aiocbe->ksi);
1578 
1579 	/* Save userspace address of the job info. */
1580 	aiocbe->uuaiocb = job;
1581 
1582 	/* Get the opcode. */
1583 	if (type != LIO_NOP)
1584 		aiocbe->uaiocb.aio_lio_opcode = type;
1585 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1586 
1587 	/*
1588 	 * Validate the opcode and fetch the file object for the specified
1589 	 * file descriptor.
1590 	 *
1591 	 * XXXRW: Moved the opcode validation up here so that we don't
1592 	 * retrieve a file descriptor without knowing what the capabiltity
1593 	 * should be.
1594 	 */
1595 	fd = aiocbe->uaiocb.aio_fildes;
1596 	switch (opcode) {
1597 	case LIO_WRITE:
1598 		error = fget_write(td, fd,
1599 		    cap_rights_init(&rights, CAP_PWRITE), &fp);
1600 		break;
1601 	case LIO_READ:
1602 		error = fget_read(td, fd,
1603 		    cap_rights_init(&rights, CAP_PREAD), &fp);
1604 		break;
1605 	case LIO_SYNC:
1606 		error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
1607 		break;
1608 	case LIO_MLOCK:
1609 		fp = NULL;
1610 		break;
1611 	case LIO_NOP:
1612 		error = fget(td, fd, cap_rights_init(&rights), &fp);
1613 		break;
1614 	default:
1615 		error = EINVAL;
1616 	}
1617 	if (error) {
1618 		uma_zfree(aiocb_zone, aiocbe);
1619 		ops->store_error(job, error);
1620 		return (error);
1621 	}
1622 
1623 	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
1624 		error = EINVAL;
1625 		goto aqueue_fail;
1626 	}
1627 
1628 	if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
1629 		error = EINVAL;
1630 		goto aqueue_fail;
1631 	}
1632 
1633 	aiocbe->fd_file = fp;
1634 
1635 	mtx_lock(&aio_job_mtx);
1636 	jid = jobrefid++;
1637 	aiocbe->seqno = jobseqno++;
1638 	mtx_unlock(&aio_job_mtx);
1639 	error = ops->store_kernelinfo(job, jid);
1640 	if (error) {
1641 		error = EINVAL;
1642 		goto aqueue_fail;
1643 	}
1644 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
1645 
1646 	if (opcode == LIO_NOP) {
1647 		fdrop(fp, td);
1648 		uma_zfree(aiocb_zone, aiocbe);
1649 		return (0);
1650 	}
1651 
1652 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
1653 		goto no_kqueue;
1654 	evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
1655 	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
1656 		error = EINVAL;
1657 		goto aqueue_fail;
1658 	}
1659 	kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1660 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
1661 	kev.filter = EVFILT_AIO;
1662 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
1663 	kev.data = (intptr_t)aiocbe;
1664 	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
1665 	error = kqfd_register(kqfd, &kev, td, 1);
1666 aqueue_fail:
1667 	if (error) {
1668 		if (fp)
1669 			fdrop(fp, td);
1670 		uma_zfree(aiocb_zone, aiocbe);
1671 		ops->store_error(job, error);
1672 		goto done;
1673 	}
1674 no_kqueue:
1675 
1676 	ops->store_error(job, EINPROGRESS);
1677 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1678 	aiocbe->userproc = p;
1679 	aiocbe->cred = crhold(td->td_ucred);
1680 	aiocbe->jobflags = 0;
1681 	aiocbe->lio = lj;
1682 
1683 	if (opcode == LIO_SYNC)
1684 		goto queueit;
1685 
1686 	if (fp && fp->f_type == DTYPE_SOCKET) {
1687 		/*
1688 		 * Alternate queueing for socket ops: Reach down into the
1689 		 * descriptor to get the socket data.  Then check to see if the
1690 		 * socket is ready to be read or written (based on the requested
1691 		 * operation).
1692 		 *
1693 		 * If it is not ready for io, then queue the aiocbe on the
1694 		 * socket, and set the flags so we get a call when sbnotify()
1695 		 * happens.
1696 		 *
1697 		 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock
1698 		 * and unlock the snd sockbuf for no reason.
1699 		 */
1700 		so = fp->f_data;
1701 		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
1702 		SOCKBUF_LOCK(sb);
1703 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1704 		    LIO_WRITE) && (!sowriteable(so)))) {
1705 			sb->sb_flags |= SB_AIO;
1706 
1707 			mtx_lock(&aio_job_mtx);
1708 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1709 			mtx_unlock(&aio_job_mtx);
1710 
1711 			AIO_LOCK(ki);
1712 			TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1713 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1714 			aiocbe->jobstate = JOBST_JOBQSOCK;
1715 			ki->kaio_count++;
1716 			if (lj)
1717 				lj->lioj_count++;
1718 			AIO_UNLOCK(ki);
1719 			SOCKBUF_UNLOCK(sb);
1720 			atomic_add_int(&num_queue_count, 1);
1721 			error = 0;
1722 			goto done;
1723 		}
1724 		SOCKBUF_UNLOCK(sb);
1725 	}
1726 
1727 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1728 		goto done;
1729 #if 0
1730 	if (error > 0) {
1731 		aiocbe->uaiocb._aiocb_private.error = error;
1732 		ops->store_error(job, error);
1733 		goto done;
1734 	}
1735 #endif
1736 queueit:
1737 	atomic_add_int(&num_queue_count, 1);
1738 
1739 	AIO_LOCK(ki);
1740 	ki->kaio_count++;
1741 	if (lj)
1742 		lj->lioj_count++;
1743 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1744 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1745 	if (opcode == LIO_SYNC) {
1746 		TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
1747 			if (cb->fd_file == aiocbe->fd_file &&
1748 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1749 			    cb->seqno < aiocbe->seqno) {
1750 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1751 				aiocbe->pending++;
1752 			}
1753 		}
1754 		TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
1755 			if (cb->fd_file == aiocbe->fd_file &&
1756 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1757 			    cb->seqno < aiocbe->seqno) {
1758 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1759 				aiocbe->pending++;
1760 			}
1761 		}
1762 		if (aiocbe->pending != 0) {
1763 			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
1764 			aiocbe->jobstate = JOBST_JOBQSYNC;
1765 			AIO_UNLOCK(ki);
1766 			goto done;
1767 		}
1768 	}
1769 	mtx_lock(&aio_job_mtx);
1770 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1771 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1772 	aio_kick_nowait(p);
1773 	mtx_unlock(&aio_job_mtx);
1774 	AIO_UNLOCK(ki);
1775 	error = 0;
1776 done:
1777 	return (error);
1778 }
1779 
1780 static void
1781 aio_kick_nowait(struct proc *userp)
1782 {
1783 	struct kaioinfo *ki = userp->p_aioinfo;
1784 	struct aioproc *aiop;
1785 
1786 	mtx_assert(&aio_job_mtx, MA_OWNED);
1787 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1788 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1789 		aiop->aioprocflags &= ~AIOP_FREE;
1790 		wakeup(aiop->aioproc);
1791 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1792 	    ((ki->kaio_active_count + num_aio_resv_start) <
1793 	    ki->kaio_maxactive_count)) {
1794 		taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
1795 	}
1796 }
1797 
1798 static int
1799 aio_kick(struct proc *userp)
1800 {
1801 	struct kaioinfo *ki = userp->p_aioinfo;
1802 	struct aioproc *aiop;
1803 	int error, ret = 0;
1804 
1805 	mtx_assert(&aio_job_mtx, MA_OWNED);
1806 retryproc:
1807 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1808 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1809 		aiop->aioprocflags &= ~AIOP_FREE;
1810 		wakeup(aiop->aioproc);
1811 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1812 	    ((ki->kaio_active_count + num_aio_resv_start) <
1813 	    ki->kaio_maxactive_count)) {
1814 		num_aio_resv_start++;
1815 		mtx_unlock(&aio_job_mtx);
1816 		error = aio_newproc(&num_aio_resv_start);
1817 		mtx_lock(&aio_job_mtx);
1818 		if (error) {
1819 			num_aio_resv_start--;
1820 			goto retryproc;
1821 		}
1822 	} else {
1823 		ret = -1;
1824 	}
1825 	return (ret);
1826 }
1827 
1828 static void
1829 aio_kick_helper(void *context, int pending)
1830 {
1831 	struct proc *userp = context;
1832 
1833 	mtx_lock(&aio_job_mtx);
1834 	while (--pending >= 0) {
1835 		if (aio_kick(userp))
1836 			break;
1837 	}
1838 	mtx_unlock(&aio_job_mtx);
1839 }
1840 
1841 /*
1842  * Support the aio_return system call, as a side-effect, kernel resources are
1843  * released.
1844  */
1845 static int
1846 kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
1847 {
1848 	struct proc *p = td->td_proc;
1849 	struct aiocblist *cb;
1850 	struct kaioinfo *ki;
1851 	int status, error;
1852 
1853 	ki = p->p_aioinfo;
1854 	if (ki == NULL)
1855 		return (EINVAL);
1856 	AIO_LOCK(ki);
1857 	TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
1858 		if (cb->uuaiocb == uaiocb)
1859 			break;
1860 	}
1861 	if (cb != NULL) {
1862 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
1863 		status = cb->uaiocb._aiocb_private.status;
1864 		error = cb->uaiocb._aiocb_private.error;
1865 		td->td_retval[0] = status;
1866 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1867 			td->td_ru.ru_oublock += cb->outputcharge;
1868 			cb->outputcharge = 0;
1869 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1870 			td->td_ru.ru_inblock += cb->inputcharge;
1871 			cb->inputcharge = 0;
1872 		}
1873 		aio_free_entry(cb);
1874 		AIO_UNLOCK(ki);
1875 		ops->store_error(uaiocb, error);
1876 		ops->store_status(uaiocb, status);
1877 	} else {
1878 		error = EINVAL;
1879 		AIO_UNLOCK(ki);
1880 	}
1881 	return (error);
1882 }
1883 
1884 int
1885 sys_aio_return(struct thread *td, struct aio_return_args *uap)
1886 {
1887 
1888 	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
1889 }
1890 
1891 /*
1892  * Allow a process to wakeup when any of the I/O requests are completed.
1893  */
1894 static int
1895 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
1896     struct timespec *ts)
1897 {
1898 	struct proc *p = td->td_proc;
1899 	struct timeval atv;
1900 	struct kaioinfo *ki;
1901 	struct aiocblist *cb, *cbfirst;
1902 	int error, i, timo;
1903 
1904 	timo = 0;
1905 	if (ts) {
1906 		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
1907 			return (EINVAL);
1908 
1909 		TIMESPEC_TO_TIMEVAL(&atv, ts);
1910 		if (itimerfix(&atv))
1911 			return (EINVAL);
1912 		timo = tvtohz(&atv);
1913 	}
1914 
1915 	ki = p->p_aioinfo;
1916 	if (ki == NULL)
1917 		return (EAGAIN);
1918 
1919 	if (njoblist == 0)
1920 		return (0);
1921 
1922 	AIO_LOCK(ki);
1923 	for (;;) {
1924 		cbfirst = NULL;
1925 		error = 0;
1926 		TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
1927 			for (i = 0; i < njoblist; i++) {
1928 				if (cb->uuaiocb == ujoblist[i]) {
1929 					if (cbfirst == NULL)
1930 						cbfirst = cb;
1931 					if (cb->jobstate == JOBST_JOBFINISHED)
1932 						goto RETURN;
1933 				}
1934 			}
1935 		}
1936 		/* All tasks were finished. */
1937 		if (cbfirst == NULL)
1938 			break;
1939 
1940 		ki->kaio_flags |= KAIO_WAKEUP;
1941 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
1942 		    "aiospn", timo);
1943 		if (error == ERESTART)
1944 			error = EINTR;
1945 		if (error)
1946 			break;
1947 	}
1948 RETURN:
1949 	AIO_UNLOCK(ki);
1950 	return (error);
1951 }
1952 
1953 int
1954 sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1955 {
1956 	struct timespec ts, *tsp;
1957 	struct aiocb **ujoblist;
1958 	int error;
1959 
1960 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
1961 		return (EINVAL);
1962 
1963 	if (uap->timeout) {
1964 		/* Get timespec struct. */
1965 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1966 			return (error);
1967 		tsp = &ts;
1968 	} else
1969 		tsp = NULL;
1970 
1971 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
1972 	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
1973 	if (error == 0)
1974 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
1975 	uma_zfree(aiol_zone, ujoblist);
1976 	return (error);
1977 }
1978 
1979 /*
1980  * aio_cancel cancels any non-physio aio operations not currently in
1981  * progress.
1982  */
1983 int
1984 sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1985 {
1986 	struct proc *p = td->td_proc;
1987 	struct kaioinfo *ki;
1988 	struct aiocblist *cbe, *cbn;
1989 	struct file *fp;
1990 	struct socket *so;
1991 	cap_rights_t rights;
1992 	int error;
1993 	int remove;
1994 	int cancelled = 0;
1995 	int notcancelled = 0;
1996 	struct vnode *vp;
1997 
1998 	/* Lookup file object. */
1999 	error = fget(td, uap->fd, cap_rights_init(&rights), &fp);
2000 	if (error)
2001 		return (error);
2002 
2003 	ki = p->p_aioinfo;
2004 	if (ki == NULL)
2005 		goto done;
2006 
2007 	if (fp->f_type == DTYPE_VNODE) {
2008 		vp = fp->f_vnode;
2009 		if (vn_isdisk(vp, &error)) {
2010 			fdrop(fp, td);
2011 			td->td_retval[0] = AIO_NOTCANCELED;
2012 			return (0);
2013 		}
2014 	}
2015 
2016 	AIO_LOCK(ki);
2017 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
2018 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
2019 		    ((uap->aiocbp == NULL) ||
2020 		     (uap->aiocbp == cbe->uuaiocb))) {
2021 			remove = 0;
2022 
2023 			mtx_lock(&aio_job_mtx);
2024 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
2025 				TAILQ_REMOVE(&aio_jobs, cbe, list);
2026 				remove = 1;
2027 			} else if (cbe->jobstate == JOBST_JOBQSOCK) {
2028 				MPASS(fp->f_type == DTYPE_SOCKET);
2029 				so = fp->f_data;
2030 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
2031 				remove = 1;
2032 			} else if (cbe->jobstate == JOBST_JOBQSYNC) {
2033 				TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
2034 				remove = 1;
2035 			}
2036 			mtx_unlock(&aio_job_mtx);
2037 
2038 			if (remove) {
2039 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
2040 				cbe->uaiocb._aiocb_private.status = -1;
2041 				cbe->uaiocb._aiocb_private.error = ECANCELED;
2042 				aio_bio_done_notify(p, cbe, DONE_QUEUE);
2043 				cancelled++;
2044 			} else {
2045 				notcancelled++;
2046 			}
2047 			if (uap->aiocbp != NULL)
2048 				break;
2049 		}
2050 	}
2051 	AIO_UNLOCK(ki);
2052 
2053 done:
2054 	fdrop(fp, td);
2055 
2056 	if (uap->aiocbp != NULL) {
2057 		if (cancelled) {
2058 			td->td_retval[0] = AIO_CANCELED;
2059 			return (0);
2060 		}
2061 	}
2062 
2063 	if (notcancelled) {
2064 		td->td_retval[0] = AIO_NOTCANCELED;
2065 		return (0);
2066 	}
2067 
2068 	if (cancelled) {
2069 		td->td_retval[0] = AIO_CANCELED;
2070 		return (0);
2071 	}
2072 
2073 	td->td_retval[0] = AIO_ALLDONE;
2074 
2075 	return (0);
2076 }
2077 
2078 /*
2079  * aio_error is implemented in the kernel level for compatibility purposes
2080  * only.  For a user mode async implementation, it would be best to do it in
2081  * a userland subroutine.
2082  */
2083 static int
2084 kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
2085 {
2086 	struct proc *p = td->td_proc;
2087 	struct aiocblist *cb;
2088 	struct kaioinfo *ki;
2089 	int status;
2090 
2091 	ki = p->p_aioinfo;
2092 	if (ki == NULL) {
2093 		td->td_retval[0] = EINVAL;
2094 		return (0);
2095 	}
2096 
2097 	AIO_LOCK(ki);
2098 	TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
2099 		if (cb->uuaiocb == aiocbp) {
2100 			if (cb->jobstate == JOBST_JOBFINISHED)
2101 				td->td_retval[0] =
2102 					cb->uaiocb._aiocb_private.error;
2103 			else
2104 				td->td_retval[0] = EINPROGRESS;
2105 			AIO_UNLOCK(ki);
2106 			return (0);
2107 		}
2108 	}
2109 	AIO_UNLOCK(ki);
2110 
2111 	/*
2112 	 * Hack for failure of aio_aqueue.
2113 	 */
2114 	status = ops->fetch_status(aiocbp);
2115 	if (status == -1) {
2116 		td->td_retval[0] = ops->fetch_error(aiocbp);
2117 		return (0);
2118 	}
2119 
2120 	td->td_retval[0] = EINVAL;
2121 	return (0);
2122 }
2123 
2124 int
2125 sys_aio_error(struct thread *td, struct aio_error_args *uap)
2126 {
2127 
2128 	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
2129 }
2130 
2131 /* syscall - asynchronous read from a file (REALTIME) */
2132 int
2133 sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
2134 {
2135 
2136 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2137 	    &aiocb_ops_osigevent));
2138 }
2139 
2140 int
2141 sys_aio_read(struct thread *td, struct aio_read_args *uap)
2142 {
2143 
2144 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
2145 }
2146 
2147 /* syscall - asynchronous write to a file (REALTIME) */
2148 int
2149 sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
2150 {
2151 
2152 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2153 	    &aiocb_ops_osigevent));
2154 }
2155 
2156 int
2157 sys_aio_write(struct thread *td, struct aio_write_args *uap)
2158 {
2159 
2160 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
2161 }
2162 
2163 int
2164 sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
2165 {
2166 
2167 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
2168 }
2169 
2170 static int
2171 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
2172     struct aiocb **acb_list, int nent, struct sigevent *sig,
2173     struct aiocb_ops *ops)
2174 {
2175 	struct proc *p = td->td_proc;
2176 	struct aiocb *iocb;
2177 	struct kaioinfo *ki;
2178 	struct aioliojob *lj;
2179 	struct kevent kev;
2180 	int error;
2181 	int nerror;
2182 	int i;
2183 
2184 	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
2185 		return (EINVAL);
2186 
2187 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2188 		return (EINVAL);
2189 
2190 	if (p->p_aioinfo == NULL)
2191 		aio_init_aioinfo(p);
2192 
2193 	ki = p->p_aioinfo;
2194 
2195 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
2196 	lj->lioj_flags = 0;
2197 	lj->lioj_count = 0;
2198 	lj->lioj_finished_count = 0;
2199 	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
2200 	ksiginfo_init(&lj->lioj_ksi);
2201 
2202 	/*
2203 	 * Setup signal.
2204 	 */
2205 	if (sig && (mode == LIO_NOWAIT)) {
2206 		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
2207 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2208 			/* Assume only new style KEVENT */
2209 			kev.filter = EVFILT_LIO;
2210 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
2211 			kev.ident = (uintptr_t)uacb_list; /* something unique */
2212 			kev.data = (intptr_t)lj;
2213 			/* pass user defined sigval data */
2214 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
2215 			error = kqfd_register(
2216 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
2217 			if (error) {
2218 				uma_zfree(aiolio_zone, lj);
2219 				return (error);
2220 			}
2221 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
2222 			;
2223 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2224 			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
2225 				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2226 					uma_zfree(aiolio_zone, lj);
2227 					return EINVAL;
2228 				}
2229 				lj->lioj_flags |= LIOJ_SIGNAL;
2230 		} else {
2231 			uma_zfree(aiolio_zone, lj);
2232 			return EINVAL;
2233 		}
2234 	}
2235 
2236 	AIO_LOCK(ki);
2237 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2238 	/*
2239 	 * Add extra aiocb count to avoid the lio to be freed
2240 	 * by other threads doing aio_waitcomplete or aio_return,
2241 	 * and prevent event from being sent until we have queued
2242 	 * all tasks.
2243 	 */
2244 	lj->lioj_count = 1;
2245 	AIO_UNLOCK(ki);
2246 
2247 	/*
2248 	 * Get pointers to the list of I/O requests.
2249 	 */
2250 	nerror = 0;
2251 	for (i = 0; i < nent; i++) {
2252 		iocb = acb_list[i];
2253 		if (iocb != NULL) {
2254 			error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
2255 			if (error != 0)
2256 				nerror++;
2257 		}
2258 	}
2259 
2260 	error = 0;
2261 	AIO_LOCK(ki);
2262 	if (mode == LIO_WAIT) {
2263 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
2264 			ki->kaio_flags |= KAIO_WAKEUP;
2265 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
2266 			    PRIBIO | PCATCH, "aiospn", 0);
2267 			if (error == ERESTART)
2268 				error = EINTR;
2269 			if (error)
2270 				break;
2271 		}
2272 	} else {
2273 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
2274 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2275 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
2276 				KNOTE_LOCKED(&lj->klist, 1);
2277 			}
2278 			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
2279 			    == LIOJ_SIGNAL
2280 			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2281 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
2282 				aio_sendsig(p, &lj->lioj_signal,
2283 					    &lj->lioj_ksi);
2284 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2285 			}
2286 		}
2287 	}
2288 	lj->lioj_count--;
2289 	if (lj->lioj_count == 0) {
2290 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
2291 		knlist_delete(&lj->klist, curthread, 1);
2292 		PROC_LOCK(p);
2293 		sigqueue_take(&lj->lioj_ksi);
2294 		PROC_UNLOCK(p);
2295 		AIO_UNLOCK(ki);
2296 		uma_zfree(aiolio_zone, lj);
2297 	} else
2298 		AIO_UNLOCK(ki);
2299 
2300 	if (nerror)
2301 		return (EIO);
2302 	return (error);
2303 }
2304 
2305 /* syscall - list directed I/O (REALTIME) */
2306 int
2307 sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
2308 {
2309 	struct aiocb **acb_list;
2310 	struct sigevent *sigp, sig;
2311 	struct osigevent osig;
2312 	int error, nent;
2313 
2314 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2315 		return (EINVAL);
2316 
2317 	nent = uap->nent;
2318 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2319 		return (EINVAL);
2320 
2321 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2322 		error = copyin(uap->sig, &osig, sizeof(osig));
2323 		if (error)
2324 			return (error);
2325 		error = convert_old_sigevent(&osig, &sig);
2326 		if (error)
2327 			return (error);
2328 		sigp = &sig;
2329 	} else
2330 		sigp = NULL;
2331 
2332 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2333 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2334 	if (error == 0)
2335 		error = kern_lio_listio(td, uap->mode,
2336 		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2337 		    &aiocb_ops_osigevent);
2338 	free(acb_list, M_LIO);
2339 	return (error);
2340 }
2341 
2342 /* syscall - list directed I/O (REALTIME) */
2343 int
2344 sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
2345 {
2346 	struct aiocb **acb_list;
2347 	struct sigevent *sigp, sig;
2348 	int error, nent;
2349 
2350 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2351 		return (EINVAL);
2352 
2353 	nent = uap->nent;
2354 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2355 		return (EINVAL);
2356 
2357 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2358 		error = copyin(uap->sig, &sig, sizeof(sig));
2359 		if (error)
2360 			return (error);
2361 		sigp = &sig;
2362 	} else
2363 		sigp = NULL;
2364 
2365 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2366 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2367 	if (error == 0)
2368 		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
2369 		    nent, sigp, &aiocb_ops);
2370 	free(acb_list, M_LIO);
2371 	return (error);
2372 }
2373 
2374 static void
2375 aio_physwakeup(struct bio *bp)
2376 {
2377 	struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1;
2378 	struct proc *userp;
2379 	struct kaioinfo *ki;
2380 	int nblks;
2381 
2382 	/* Release mapping into kernel space. */
2383 	if (aiocbe->pbuf) {
2384 		pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages);
2385 		relpbuf(aiocbe->pbuf, NULL);
2386 		aiocbe->pbuf = NULL;
2387 		atomic_subtract_int(&num_buf_aio, 1);
2388 	}
2389 	vm_page_unhold_pages(aiocbe->pages, aiocbe->npages);
2390 
2391 	bp = aiocbe->bp;
2392 	aiocbe->bp = NULL;
2393 	userp = aiocbe->userproc;
2394 	ki = userp->p_aioinfo;
2395 	AIO_LOCK(ki);
2396 	aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid;
2397 	aiocbe->uaiocb._aiocb_private.error = 0;
2398 	if (bp->bio_flags & BIO_ERROR)
2399 		aiocbe->uaiocb._aiocb_private.error = bp->bio_error;
2400 	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
2401 	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
2402 		aiocbe->outputcharge += nblks;
2403 	else
2404 		aiocbe->inputcharge += nblks;
2405 	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
2406 	ki->kaio_buffer_count--;
2407 	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
2408 	AIO_UNLOCK(ki);
2409 
2410 	g_destroy_bio(bp);
2411 }
2412 
2413 /* syscall - wait for the next completion of an aio request */
2414 static int
2415 kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
2416     struct timespec *ts, struct aiocb_ops *ops)
2417 {
2418 	struct proc *p = td->td_proc;
2419 	struct timeval atv;
2420 	struct kaioinfo *ki;
2421 	struct aiocblist *cb;
2422 	struct aiocb *uuaiocb;
2423 	int error, status, timo;
2424 
2425 	ops->store_aiocb(aiocbp, NULL);
2426 
2427 	if (ts == NULL) {
2428 		timo = 0;
2429 	} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
2430 		timo = -1;
2431 	} else {
2432 		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
2433 			return (EINVAL);
2434 
2435 		TIMESPEC_TO_TIMEVAL(&atv, ts);
2436 		if (itimerfix(&atv))
2437 			return (EINVAL);
2438 		timo = tvtohz(&atv);
2439 	}
2440 
2441 	if (p->p_aioinfo == NULL)
2442 		aio_init_aioinfo(p);
2443 	ki = p->p_aioinfo;
2444 
2445 	error = 0;
2446 	cb = NULL;
2447 	AIO_LOCK(ki);
2448 	while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
2449 		if (timo == -1) {
2450 			error = EWOULDBLOCK;
2451 			break;
2452 		}
2453 		ki->kaio_flags |= KAIO_WAKEUP;
2454 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2455 		    "aiowc", timo);
2456 		if (timo && error == ERESTART)
2457 			error = EINTR;
2458 		if (error)
2459 			break;
2460 	}
2461 
2462 	if (cb != NULL) {
2463 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
2464 		uuaiocb = cb->uuaiocb;
2465 		status = cb->uaiocb._aiocb_private.status;
2466 		error = cb->uaiocb._aiocb_private.error;
2467 		td->td_retval[0] = status;
2468 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2469 			td->td_ru.ru_oublock += cb->outputcharge;
2470 			cb->outputcharge = 0;
2471 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2472 			td->td_ru.ru_inblock += cb->inputcharge;
2473 			cb->inputcharge = 0;
2474 		}
2475 		aio_free_entry(cb);
2476 		AIO_UNLOCK(ki);
2477 		ops->store_aiocb(aiocbp, uuaiocb);
2478 		ops->store_error(uuaiocb, error);
2479 		ops->store_status(uuaiocb, status);
2480 	} else
2481 		AIO_UNLOCK(ki);
2482 
2483 	return (error);
2484 }
2485 
2486 int
2487 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2488 {
2489 	struct timespec ts, *tsp;
2490 	int error;
2491 
2492 	if (uap->timeout) {
2493 		/* Get timespec struct. */
2494 		error = copyin(uap->timeout, &ts, sizeof(ts));
2495 		if (error)
2496 			return (error);
2497 		tsp = &ts;
2498 	} else
2499 		tsp = NULL;
2500 
2501 	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
2502 }
2503 
2504 static int
2505 kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
2506     struct aiocb_ops *ops)
2507 {
2508 	struct proc *p = td->td_proc;
2509 	struct kaioinfo *ki;
2510 
2511 	if (op != O_SYNC) /* XXX lack of O_DSYNC */
2512 		return (EINVAL);
2513 	ki = p->p_aioinfo;
2514 	if (ki == NULL)
2515 		aio_init_aioinfo(p);
2516 	return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
2517 }
2518 
2519 int
2520 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
2521 {
2522 
2523 	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
2524 }
2525 
2526 /* kqueue attach function */
2527 static int
2528 filt_aioattach(struct knote *kn)
2529 {
2530 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2531 
2532 	/*
2533 	 * The aiocbe pointer must be validated before using it, so
2534 	 * registration is restricted to the kernel; the user cannot
2535 	 * set EV_FLAG1.
2536 	 */
2537 	if ((kn->kn_flags & EV_FLAG1) == 0)
2538 		return (EPERM);
2539 	kn->kn_ptr.p_aio = aiocbe;
2540 	kn->kn_flags &= ~EV_FLAG1;
2541 
2542 	knlist_add(&aiocbe->klist, kn, 0);
2543 
2544 	return (0);
2545 }
2546 
2547 /* kqueue detach function */
2548 static void
2549 filt_aiodetach(struct knote *kn)
2550 {
2551 	struct knlist *knl;
2552 
2553 	knl = &kn->kn_ptr.p_aio->klist;
2554 	knl->kl_lock(knl->kl_lockarg);
2555 	if (!knlist_empty(knl))
2556 		knlist_remove(knl, kn, 1);
2557 	knl->kl_unlock(knl->kl_lockarg);
2558 }
2559 
2560 /* kqueue filter function */
2561 /*ARGSUSED*/
2562 static int
2563 filt_aio(struct knote *kn, long hint)
2564 {
2565 	struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
2566 
2567 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2568 	if (aiocbe->jobstate != JOBST_JOBFINISHED)
2569 		return (0);
2570 	kn->kn_flags |= EV_EOF;
2571 	return (1);
2572 }
2573 
2574 /* kqueue attach function */
2575 static int
2576 filt_lioattach(struct knote *kn)
2577 {
2578 	struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
2579 
2580 	/*
2581 	 * The aioliojob pointer must be validated before using it, so
2582 	 * registration is restricted to the kernel; the user cannot
2583 	 * set EV_FLAG1.
2584 	 */
2585 	if ((kn->kn_flags & EV_FLAG1) == 0)
2586 		return (EPERM);
2587 	kn->kn_ptr.p_lio = lj;
2588 	kn->kn_flags &= ~EV_FLAG1;
2589 
2590 	knlist_add(&lj->klist, kn, 0);
2591 
2592 	return (0);
2593 }
2594 
2595 /* kqueue detach function */
2596 static void
2597 filt_liodetach(struct knote *kn)
2598 {
2599 	struct knlist *knl;
2600 
2601 	knl = &kn->kn_ptr.p_lio->klist;
2602 	knl->kl_lock(knl->kl_lockarg);
2603 	if (!knlist_empty(knl))
2604 		knlist_remove(knl, kn, 1);
2605 	knl->kl_unlock(knl->kl_lockarg);
2606 }
2607 
2608 /* kqueue filter function */
2609 /*ARGSUSED*/
2610 static int
2611 filt_lio(struct knote *kn, long hint)
2612 {
2613 	struct aioliojob * lj = kn->kn_ptr.p_lio;
2614 
2615 	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
2616 }
2617 
2618 #ifdef COMPAT_FREEBSD32
2619 
/* 32-bit image of the private bookkeeping area embedded in an aiocb. */
struct __aiocb_private32 {
	int32_t		status;		/* read/written via fuword32/suword32 */
	int32_t		error;		/* read/written via fuword32/suword32 */
	uint32_t	kernelinfo;	/* 32-bit user pointer value */
};
2625 
2626 typedef struct oaiocb32 {
2627 	int	aio_fildes;		/* File descriptor */
2628 	uint64_t aio_offset __packed;	/* File offset for I/O */
2629 	uint32_t aio_buf;		/* I/O buffer in process space */
2630 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2631 	struct	osigevent32 aio_sigevent; /* Signal to deliver */
2632 	int	aio_lio_opcode;		/* LIO opcode */
2633 	int	aio_reqprio;		/* Request priority -- ignored */
2634 	struct	__aiocb_private32 _aiocb_private;
2635 } oaiocb32_t;
2636 
2637 typedef struct aiocb32 {
2638 	int32_t	aio_fildes;		/* File descriptor */
2639 	uint64_t aio_offset __packed;	/* File offset for I/O */
2640 	uint32_t aio_buf;		/* I/O buffer in process space */
2641 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2642 	int	__spare__[2];
2643 	uint32_t __spare2__;
2644 	int	aio_lio_opcode;		/* LIO opcode */
2645 	int	aio_reqprio;		/* Request priority -- ignored */
2646 	struct __aiocb_private32 _aiocb_private;
2647 	struct sigevent32 aio_sigevent;	/* Signal to deliver */
2648 } aiocb32_t;
2649 
2650 static int
2651 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
2652 {
2653 
2654 	/*
2655 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
2656 	 * supported by AIO with the old sigevent structure.
2657 	 */
2658 	CP(*osig, *nsig, sigev_notify);
2659 	switch (nsig->sigev_notify) {
2660 	case SIGEV_NONE:
2661 		break;
2662 	case SIGEV_SIGNAL:
2663 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
2664 		break;
2665 	case SIGEV_KEVENT:
2666 		nsig->sigev_notify_kqueue =
2667 		    osig->__sigev_u.__sigev_notify_kqueue;
2668 		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
2669 		break;
2670 	default:
2671 		return (EINVAL);
2672 	}
2673 	return (0);
2674 }
2675 
2676 static int
2677 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
2678 {
2679 	struct oaiocb32 job32;
2680 	int error;
2681 
2682 	bzero(kjob, sizeof(struct aiocb));
2683 	error = copyin(ujob, &job32, sizeof(job32));
2684 	if (error)
2685 		return (error);
2686 
2687 	CP(job32, *kjob, aio_fildes);
2688 	CP(job32, *kjob, aio_offset);
2689 	PTRIN_CP(job32, *kjob, aio_buf);
2690 	CP(job32, *kjob, aio_nbytes);
2691 	CP(job32, *kjob, aio_lio_opcode);
2692 	CP(job32, *kjob, aio_reqprio);
2693 	CP(job32, *kjob, _aiocb_private.status);
2694 	CP(job32, *kjob, _aiocb_private.error);
2695 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2696 	return (convert_old_sigevent32(&job32.aio_sigevent,
2697 	    &kjob->aio_sigevent));
2698 }
2699 
2700 static int
2701 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
2702 {
2703 	struct aiocb32 job32;
2704 	int error;
2705 
2706 	error = copyin(ujob, &job32, sizeof(job32));
2707 	if (error)
2708 		return (error);
2709 	CP(job32, *kjob, aio_fildes);
2710 	CP(job32, *kjob, aio_offset);
2711 	PTRIN_CP(job32, *kjob, aio_buf);
2712 	CP(job32, *kjob, aio_nbytes);
2713 	CP(job32, *kjob, aio_lio_opcode);
2714 	CP(job32, *kjob, aio_reqprio);
2715 	CP(job32, *kjob, _aiocb_private.status);
2716 	CP(job32, *kjob, _aiocb_private.error);
2717 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2718 	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
2719 }
2720 
2721 static long
2722 aiocb32_fetch_status(struct aiocb *ujob)
2723 {
2724 	struct aiocb32 *ujob32;
2725 
2726 	ujob32 = (struct aiocb32 *)ujob;
2727 	return (fuword32(&ujob32->_aiocb_private.status));
2728 }
2729 
2730 static long
2731 aiocb32_fetch_error(struct aiocb *ujob)
2732 {
2733 	struct aiocb32 *ujob32;
2734 
2735 	ujob32 = (struct aiocb32 *)ujob;
2736 	return (fuword32(&ujob32->_aiocb_private.error));
2737 }
2738 
2739 static int
2740 aiocb32_store_status(struct aiocb *ujob, long status)
2741 {
2742 	struct aiocb32 *ujob32;
2743 
2744 	ujob32 = (struct aiocb32 *)ujob;
2745 	return (suword32(&ujob32->_aiocb_private.status, status));
2746 }
2747 
2748 static int
2749 aiocb32_store_error(struct aiocb *ujob, long error)
2750 {
2751 	struct aiocb32 *ujob32;
2752 
2753 	ujob32 = (struct aiocb32 *)ujob;
2754 	return (suword32(&ujob32->_aiocb_private.error, error));
2755 }
2756 
2757 static int
2758 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
2759 {
2760 	struct aiocb32 *ujob32;
2761 
2762 	ujob32 = (struct aiocb32 *)ujob;
2763 	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
2764 }
2765 
/*
 * Store the user address ujob as a 32-bit word at the user location
 * ujobp.
 */
static int
aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
	long value = (long)ujob;

	return (suword32(ujobp, value));
}
2772 
2773 static struct aiocb_ops aiocb32_ops = {
2774 	.copyin = aiocb32_copyin,
2775 	.fetch_status = aiocb32_fetch_status,
2776 	.fetch_error = aiocb32_fetch_error,
2777 	.store_status = aiocb32_store_status,
2778 	.store_error = aiocb32_store_error,
2779 	.store_kernelinfo = aiocb32_store_kernelinfo,
2780 	.store_aiocb = aiocb32_store_aiocb,
2781 };
2782 
2783 static struct aiocb_ops aiocb32_ops_osigevent = {
2784 	.copyin = aiocb32_copyin_old_sigevent,
2785 	.fetch_status = aiocb32_fetch_status,
2786 	.fetch_error = aiocb32_fetch_error,
2787 	.store_status = aiocb32_store_status,
2788 	.store_error = aiocb32_store_error,
2789 	.store_kernelinfo = aiocb32_store_kernelinfo,
2790 	.store_aiocb = aiocb32_store_aiocb,
2791 };
2792 
2793 int
2794 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
2795 {
2796 
2797 	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2798 }
2799 
2800 int
2801 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
2802 {
2803 	struct timespec32 ts32;
2804 	struct timespec ts, *tsp;
2805 	struct aiocb **ujoblist;
2806 	uint32_t *ujoblist32;
2807 	int error, i;
2808 
2809 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
2810 		return (EINVAL);
2811 
2812 	if (uap->timeout) {
2813 		/* Get timespec struct. */
2814 		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
2815 			return (error);
2816 		CP(ts32, ts, tv_sec);
2817 		CP(ts32, ts, tv_nsec);
2818 		tsp = &ts;
2819 	} else
2820 		tsp = NULL;
2821 
2822 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
2823 	ujoblist32 = (uint32_t *)ujoblist;
2824 	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
2825 	    sizeof(ujoblist32[0]));
2826 	if (error == 0) {
2827 		for (i = uap->nent; i > 0; i--)
2828 			ujoblist[i] = PTRIN(ujoblist32[i]);
2829 
2830 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2831 	}
2832 	uma_zfree(aiol_zone, ujoblist);
2833 	return (error);
2834 }
2835 
/*
 * 32-bit compat entry point for aio_cancel: the argument structure is
 * reinterpreted as struct aio_cancel_args and passed to sys_aio_cancel().
 */
int
freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
{
	struct aio_cancel_args *args;

	args = (struct aio_cancel_args *)uap;
	return (sys_aio_cancel(td, args));
}
2842 
2843 int
2844 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
2845 {
2846 
2847 	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2848 }
2849 
2850 int
2851 freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
2852 {
2853 
2854 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2855 	    &aiocb32_ops_osigevent));
2856 }
2857 
2858 int
2859 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
2860 {
2861 
2862 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2863 	    &aiocb32_ops));
2864 }
2865 
2866 int
2867 freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
2868 {
2869 
2870 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2871 	    &aiocb32_ops_osigevent));
2872 }
2873 
2874 int
2875 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
2876 {
2877 
2878 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2879 	    &aiocb32_ops));
2880 }
2881 
2882 int
2883 freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
2884 {
2885 
2886 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
2887 	    &aiocb32_ops));
2888 }
2889 
2890 int
2891 freebsd32_aio_waitcomplete(struct thread *td,
2892     struct freebsd32_aio_waitcomplete_args *uap)
2893 {
2894 	struct timespec32 ts32;
2895 	struct timespec ts, *tsp;
2896 	int error;
2897 
2898 	if (uap->timeout) {
2899 		/* Get timespec struct. */
2900 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
2901 		if (error)
2902 			return (error);
2903 		CP(ts32, ts, tv_sec);
2904 		CP(ts32, ts, tv_nsec);
2905 		tsp = &ts;
2906 	} else
2907 		tsp = NULL;
2908 
2909 	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
2910 	    &aiocb32_ops));
2911 }
2912 
2913 int
2914 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
2915 {
2916 
2917 	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
2918 	    &aiocb32_ops));
2919 }
2920 
2921 int
2922 freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
2923 {
2924 	struct aiocb **acb_list;
2925 	struct sigevent *sigp, sig;
2926 	struct osigevent32 osig;
2927 	uint32_t *acb_list32;
2928 	int error, i, nent;
2929 
2930 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2931 		return (EINVAL);
2932 
2933 	nent = uap->nent;
2934 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2935 		return (EINVAL);
2936 
2937 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2938 		error = copyin(uap->sig, &osig, sizeof(osig));
2939 		if (error)
2940 			return (error);
2941 		error = convert_old_sigevent32(&osig, &sig);
2942 		if (error)
2943 			return (error);
2944 		sigp = &sig;
2945 	} else
2946 		sigp = NULL;
2947 
2948 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2949 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
2950 	if (error) {
2951 		free(acb_list32, M_LIO);
2952 		return (error);
2953 	}
2954 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2955 	for (i = 0; i < nent; i++)
2956 		acb_list[i] = PTRIN(acb_list32[i]);
2957 	free(acb_list32, M_LIO);
2958 
2959 	error = kern_lio_listio(td, uap->mode,
2960 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2961 	    &aiocb32_ops_osigevent);
2962 	free(acb_list, M_LIO);
2963 	return (error);
2964 }
2965 
2966 int
2967 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
2968 {
2969 	struct aiocb **acb_list;
2970 	struct sigevent *sigp, sig;
2971 	struct sigevent32 sig32;
2972 	uint32_t *acb_list32;
2973 	int error, i, nent;
2974 
2975 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2976 		return (EINVAL);
2977 
2978 	nent = uap->nent;
2979 	if (nent < 0 || nent > AIO_LISTIO_MAX)
2980 		return (EINVAL);
2981 
2982 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2983 		error = copyin(uap->sig, &sig32, sizeof(sig32));
2984 		if (error)
2985 			return (error);
2986 		error = convert_sigevent32(&sig32, &sig);
2987 		if (error)
2988 			return (error);
2989 		sigp = &sig;
2990 	} else
2991 		sigp = NULL;
2992 
2993 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2994 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
2995 	if (error) {
2996 		free(acb_list32, M_LIO);
2997 		return (error);
2998 	}
2999 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
3000 	for (i = 0; i < nent; i++)
3001 		acb_list[i] = PTRIN(acb_list32[i]);
3002 	free(acb_list32, M_LIO);
3003 
3004 	error = kern_lio_listio(td, uap->mode,
3005 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
3006 	    &aiocb32_ops);
3007 	free(acb_list, M_LIO);
3008 	return (error);
3009 }
3010 
3011 #endif
3012