xref: /freebsd/sys/kern/vfs_aio.c (revision 71fe318b852b8dfb3e799cb12ef184750f7f8eac)
1 /*
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  *
16  * $FreeBSD$
17  */
18 
19 /*
20  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21  */
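
/*
 * Userland view of the interface implemented below -- a minimal,
 * illustrative sketch only (not part of this file; "fd" is an already
 * open descriptor, error handling is omitted, and a real program would
 * block in aio_suspend() or aio_waitcomplete() rather than poll):
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <string.h>
 *
 *	struct aiocb acb;
 *	char buf[512];
 *	ssize_t done;
 *
 *	memset(&acb, 0, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_offset = 0;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = sizeof(buf);
 *	aio_read(&acb);
 *	while (aio_error(&acb) == EINPROGRESS)
 *		;
 *	done = aio_return(&acb);
 */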
22 
23 #include <sys/param.h>
24 #include <sys/systm.h>
25 #include <sys/malloc.h>
26 #include <sys/bio.h>
27 #include <sys/buf.h>
28 #include <sys/sysproto.h>
29 #include <sys/filedesc.h>
30 #include <sys/kernel.h>
31 #include <sys/kthread.h>
32 #include <sys/fcntl.h>
33 #include <sys/file.h>
34 #include <sys/lock.h>
35 #include <sys/mutex.h>
36 #include <sys/unistd.h>
37 #include <sys/proc.h>
38 #include <sys/resourcevar.h>
39 #include <sys/signalvar.h>
40 #include <sys/protosw.h>
41 #include <sys/socketvar.h>
42 #include <sys/syscall.h>
43 #include <sys/sysent.h>
44 #include <sys/sysctl.h>
45 #include <sys/sx.h>
46 #include <sys/vnode.h>
47 #include <sys/conf.h>
48 #include <sys/event.h>
49 
50 #include <vm/vm.h>
51 #include <vm/vm_extern.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_map.h>
54 #include <vm/uma.h>
55 #include <sys/aio.h>
56 
57 #include <machine/limits.h>
58 
59 #include "opt_vfs_aio.h"
60 
61 /*
62  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
63  * overflow.
64  */
65 static	long jobrefid;
66 
67 #define JOBST_NULL		0x0
68 #define JOBST_JOBQGLOBAL	0x2
69 #define JOBST_JOBRUNNING	0x3
70 #define JOBST_JOBFINISHED	0x4
71 #define	JOBST_JOBQBUF		0x5
72 #define	JOBST_JOBBFINISHED	0x6
73 
74 #ifndef MAX_AIO_PER_PROC
75 #define MAX_AIO_PER_PROC	32
76 #endif
77 
78 #ifndef MAX_AIO_QUEUE_PER_PROC
79 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
80 #endif
81 
82 #ifndef MAX_AIO_PROCS
83 #define MAX_AIO_PROCS		32
84 #endif
85 
86 #ifndef MAX_AIO_QUEUE
87 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
88 #endif
89 
90 #ifndef TARGET_AIO_PROCS
91 #define TARGET_AIO_PROCS	4
92 #endif
93 
94 #ifndef MAX_BUF_AIO
95 #define MAX_BUF_AIO		16
96 #endif
97 
98 #ifndef AIOD_TIMEOUT_DEFAULT
99 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
100 #endif
101 
102 #ifndef AIOD_LIFETIME_DEFAULT
103 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
104 #endif
105 
106 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
107 
108 static int max_aio_procs = MAX_AIO_PROCS;
109 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
110 	CTLFLAG_RW, &max_aio_procs, 0,
111 	"Maximum number of kernel threads to use for handling async IO");
112 
113 static int num_aio_procs = 0;
114 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
115 	CTLFLAG_RD, &num_aio_procs, 0,
116 	"Number of presently active kernel threads for async IO");
117 
118 /*
119  * The code will adjust the actual number of AIO processes towards this
120  * number when it gets a chance.
121  */
122 static int target_aio_procs = TARGET_AIO_PROCS;
123 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
124 	0, "Preferred number of ready kernel threads for async IO");
125 
126 static int max_queue_count = MAX_AIO_QUEUE;
127 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
128     "Maximum number of aio requests to queue, globally");
129 
130 static int num_queue_count = 0;
131 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
132     "Number of queued aio requests");
133 
134 static int num_buf_aio = 0;
135 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
136     "Number of aio requests presently handled by the buf subsystem");
137 
138 /* Number of async I/O threads in the process of being started */
139 /* XXX This should be local to _aio_aqueue() */
140 static int num_aio_resv_start = 0;
141 
142 static int aiod_timeout;
143 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
144     "Timeout value for synchronous aio operations");
145 
146 static int aiod_lifetime;
147 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
148     "Maximum lifetime for idle aiod");
149 
150 static int unloadable = 0;
151 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
152     "Allow unload of aio (not recommended)");
153 
154 
155 static int max_aio_per_proc = MAX_AIO_PER_PROC;
156 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
157     0, "Maximum active aio requests per process (stored in the process)");
158 
159 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
160 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
161     &max_aio_queue_per_proc, 0,
162     "Maximum queued aio requests per process (stored in the process)");
163 
164 static int max_buf_aio = MAX_BUF_AIO;
165 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
166     "Maximum buf aio requests per process (stored in the process)");
167 
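/*
 * The limits and counters above are exported under the vfs.aio sysctl tree.
 * A minimal userland sketch of reading one of them with sysctlbyname(3)
 * (illustrative only; error handling omitted):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int val;
 *	size_t len = sizeof(val);
 *	sysctlbyname("vfs.aio.num_queue_count", &val, &len, NULL, 0);
 *	printf("queued aio requests: %d\n", val);
 */
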
168 struct aiocblist {
169 	TAILQ_ENTRY(aiocblist) list;	/* List of jobs */
170 	TAILQ_ENTRY(aiocblist) plist;	/* List of jobs for proc */
171 	int	jobflags;
172 	int	jobstate;
173 	int	inputcharge;
174 	int	outputcharge;
175 	struct	callout_handle timeouthandle;
176 	struct	buf *bp;		/* Buffer pointer */
177 	struct	proc *userproc;		/* User process */ /* Not td! */
178 	struct	file *fd_file;		/* Pointer to file structure */
179 	struct	aio_liojob *lio;	/* Optional lio job */
180 	struct	aiocb *uuaiocb;		/* Pointer in userspace of aiocb */
181 	struct	klist klist;		/* list of knotes */
182 	struct	aiocb uaiocb;		/* Kernel I/O control block */
183 };
184 
185 /* jobflags */
186 #define AIOCBLIST_RUNDOWN       0x4
187 #define AIOCBLIST_DONE          0x10
188 
189 /*
190  * AIO process info
191  */
192 #define AIOP_FREE	0x1			/* proc on free queue */
193 #define AIOP_SCHED	0x2			/* proc explicitly scheduled */
194 
195 struct aiothreadlist {
196 	int aiothreadflags;			/* AIO proc flags */
197 	TAILQ_ENTRY(aiothreadlist) list;	/* List of processes */
198 	struct thread *aiothread;		/* The AIO thread */
199 };
200 
201 /*
202  * data-structure for lio signal management
203  */
204 struct aio_liojob {
205 	int	lioj_flags;
206 	int	lioj_buffer_count;
207 	int	lioj_buffer_finished_count;
208 	int	lioj_queue_count;
209 	int	lioj_queue_finished_count;
210 	struct	sigevent lioj_signal;	/* signal on all I/O done */
211 	TAILQ_ENTRY(aio_liojob) lioj_list;
212 	struct	kaioinfo *lioj_ki;
213 };
214 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
215 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
216 
217 /*
218  * per process aio data structure
219  */
220 struct kaioinfo {
221 	int	kaio_flags;		/* per process kaio flags */
222 	int	kaio_maxactive_count;	/* maximum number of AIOs */
223 	int	kaio_active_count;	/* number of currently used AIOs */
224 	int	kaio_qallowed_count;	/* maximum size of AIO queue */
225 	int	kaio_queue_count;	/* size of AIO queue */
226 	int	kaio_ballowed_count;	/* maximum number of buffers */
227 	int	kaio_queue_finished_count; /* number of daemon jobs finished */
228 	int	kaio_buffer_count;	/* number of physio buffers */
229 	int	kaio_buffer_finished_count; /* count of I/O done */
230 	struct 	proc *kaio_p;		/* process that uses this kaio block */
231 	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
232 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
233 	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
234 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
235 	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
236 	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
237 };
238 
239 #define KAIO_RUNDOWN	0x1	/* process is being run down */
240 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
241 
242 static TAILQ_HEAD(,aiothreadlist) aio_activeproc;	/* Active daemons */
243 static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* Idle daemons */
244 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
245 static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
246 
247 static void	aio_init_aioinfo(struct proc *p);
248 static void	aio_onceonly(void);
249 static int	aio_free_entry(struct aiocblist *aiocbe);
250 static void	aio_process(struct aiocblist *aiocbe);
251 static int	aio_newproc(void);
252 static int	aio_aqueue(struct thread *td, struct aiocb *job, int type);
253 static void	aio_physwakeup(struct buf *bp);
254 static void	aio_proc_rundown(struct proc *p);
255 static int	aio_fphysio(struct aiocblist *aiocbe);
256 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
257 static void	aio_daemon(void *uproc);
258 static void	aio_swake_cb(struct socket *, struct sockbuf *);
259 static int	aio_unload(void);
260 static void	process_signal(void *aioj);
261 static int	filt_aioattach(struct knote *kn);
262 static void	filt_aiodetach(struct knote *kn);
263 static int	filt_aio(struct knote *kn, long hint);
264 
265 /*
266  * Zones for:
267  * 	kaio	Per process async io info
268  *	aiop	async io thread data
269  *	aiocb	async io jobs
270  *	aiol	list io job pointer - internal to aio_suspend XXX
271  *	aiolio	list io jobs
272  */
273 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
274 
275 /* kqueue filters for aio */
276 static struct filterops aio_filtops =
277 	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
278 
279 /*
280  * Main operations function for use as a kernel module.
281  */
282 static int
283 aio_modload(struct module *module, int cmd, void *arg)
284 {
285 	int error = 0;
286 
287 	switch (cmd) {
288 	case MOD_LOAD:
289 		aio_onceonly();
290 		break;
291 	case MOD_UNLOAD:
292 		error = aio_unload();
293 		break;
294 	case MOD_SHUTDOWN:
295 		break;
296 	default:
297 		error = EINVAL;
298 		break;
299 	}
300 	return (error);
301 }
302 
303 static moduledata_t aio_mod = {
304 	"aio",
305 	&aio_modload,
306 	NULL
307 };
308 
309 SYSCALL_MODULE_HELPER(aio_return);
310 SYSCALL_MODULE_HELPER(aio_suspend);
311 SYSCALL_MODULE_HELPER(aio_cancel);
312 SYSCALL_MODULE_HELPER(aio_error);
313 SYSCALL_MODULE_HELPER(aio_read);
314 SYSCALL_MODULE_HELPER(aio_write);
315 SYSCALL_MODULE_HELPER(aio_waitcomplete);
316 SYSCALL_MODULE_HELPER(lio_listio);
317 
318 DECLARE_MODULE(aio, aio_mod,
319 	SI_SUB_VFS, SI_ORDER_ANY);
320 MODULE_VERSION(aio, 1);
321 
322 /*
323  * Startup initialization
324  */
325 static void
326 aio_onceonly(void)
327 {
328 
329 	/* XXX: should probably just use so->callback */
330 	aio_swake = &aio_swake_cb;
331 	at_exit(aio_proc_rundown);
332 	at_exec(aio_proc_rundown);
333 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
334 	TAILQ_INIT(&aio_freeproc);
335 	TAILQ_INIT(&aio_activeproc);
336 	TAILQ_INIT(&aio_jobs);
337 	TAILQ_INIT(&aio_bufjobs);
338 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
339 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
340 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
341 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
342 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
343 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
344 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
345 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
346 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aio_liojob), NULL,
347 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
348 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
349 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
350 	jobrefid = 1;
351 	async_io_version = _POSIX_VERSION;
352 }
353 
354 /*
355  * Callback for unload of AIO when used as a module.
356  */
357 static int
358 aio_unload(void)
359 {
360 
361 	/*
362 	 * XXX: no unloads by default, it's too dangerous.
363 	 * Perhaps we could do it if we locked out callers and then
364 	 * did an aio_proc_rundown() on each process.
365 	 */
366 	if (!unloadable)
367 		return (EOPNOTSUPP);
368 
369 	async_io_version = 0;
370 	aio_swake = NULL;
371 	rm_at_exit(aio_proc_rundown);
372 	rm_at_exec(aio_proc_rundown);
373 	kqueue_del_filteropts(EVFILT_AIO);
374 	return (0);
375 }
376 
377 /*
378  * Init the per-process aioinfo structure.  The aioinfo limits are set
379  * per-process for user limit (resource) management.
380  */
381 static void
382 aio_init_aioinfo(struct proc *p)
383 {
384 	struct kaioinfo *ki;
385 	if (p->p_aioinfo == NULL) {
386 		ki = uma_zalloc(kaio_zone, M_WAITOK);
387 		p->p_aioinfo = ki;
388 		ki->kaio_flags = 0;
389 		ki->kaio_maxactive_count = max_aio_per_proc;
390 		ki->kaio_active_count = 0;
391 		ki->kaio_qallowed_count = max_aio_queue_per_proc;
392 		ki->kaio_queue_count = 0;
393 		ki->kaio_ballowed_count = max_buf_aio;
394 		ki->kaio_buffer_count = 0;
395 		ki->kaio_buffer_finished_count = 0;
396 		ki->kaio_p = p;
397 		TAILQ_INIT(&ki->kaio_jobdone);
398 		TAILQ_INIT(&ki->kaio_jobqueue);
399 		TAILQ_INIT(&ki->kaio_bufdone);
400 		TAILQ_INIT(&ki->kaio_bufqueue);
401 		TAILQ_INIT(&ki->kaio_liojoblist);
402 		TAILQ_INIT(&ki->kaio_sockqueue);
403 	}
404 
405 	while (num_aio_procs < target_aio_procs)
406 		aio_newproc();
407 }
408 
409 /*
410  * Free a job entry.  Wait for completion if it is currently active, but don't
411  * delay forever.  If we delay, we return a flag that says that we have to
412  * restart the queue scan.
413  */
414 static int
415 aio_free_entry(struct aiocblist *aiocbe)
416 {
417 	struct kaioinfo *ki;
418 	struct aio_liojob *lj;
419 	struct proc *p;
420 	int error;
421 	int s;
422 
423 	if (aiocbe->jobstate == JOBST_NULL)
424 		panic("aio_free_entry: freeing already free job");
425 
426 	p = aiocbe->userproc;
427 	ki = p->p_aioinfo;
428 	lj = aiocbe->lio;
429 	if (ki == NULL)
430 		panic("aio_free_entry: missing p->p_aioinfo");
431 
432 	while (aiocbe->jobstate == JOBST_JOBRUNNING) {
433 		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
434 		tsleep(aiocbe, PRIBIO, "jobwai", 0);
435 	}
436 	if (aiocbe->bp == NULL) {
437 		if (ki->kaio_queue_count <= 0)
438 			panic("aio_free_entry: process queue size <= 0");
439 		if (num_queue_count <= 0)
440 			panic("aio_free_entry: system wide queue size <= 0");
441 
442 		if (lj) {
443 			lj->lioj_queue_count--;
444 			if (aiocbe->jobflags & AIOCBLIST_DONE)
445 				lj->lioj_queue_finished_count--;
446 		}
447 		ki->kaio_queue_count--;
448 		if (aiocbe->jobflags & AIOCBLIST_DONE)
449 			ki->kaio_queue_finished_count--;
450 		num_queue_count--;
451 	} else {
452 		if (lj) {
453 			lj->lioj_buffer_count--;
454 			if (aiocbe->jobflags & AIOCBLIST_DONE)
455 				lj->lioj_buffer_finished_count--;
456 		}
457 		if (aiocbe->jobflags & AIOCBLIST_DONE)
458 			ki->kaio_buffer_finished_count--;
459 		ki->kaio_buffer_count--;
460 		num_buf_aio--;
461 	}
462 
463 	/* aiocbe is going away, we need to destroy any knotes */
464 	/* XXXKSE Note: the thread here is used to eventually find the
465 	 * owning process again, but it is also used to do a fo_close,
466 	 * and that requires the thread.  (But does it require the
467 	 * OWNING thread, or maybe the running thread?)
468 	 * There is a semantic problem here...
469 	 */
470 	knote_remove(FIRST_THREAD_IN_PROC(p), &aiocbe->klist); /* XXXKSE */
471 
472 	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
473 	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
474 		ki->kaio_flags &= ~KAIO_WAKEUP;
475 		wakeup(p);
476 	}
477 
478 	if (aiocbe->jobstate == JOBST_JOBQBUF) {
479 		if ((error = aio_fphysio(aiocbe)) != 0)
480 			return error;
481 		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
482 			panic("aio_free_entry: invalid physio finish-up state");
483 		s = splbio();
484 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
485 		splx(s);
486 	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
487 		s = splnet();
488 		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
489 		TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
490 		splx(s);
491 	} else if (aiocbe->jobstate == JOBST_JOBFINISHED)
492 		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
493 	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
494 		s = splbio();
495 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
496 		splx(s);
497 		if (aiocbe->bp) {
498 			vunmapbuf(aiocbe->bp);
499 			relpbuf(aiocbe->bp, NULL);
500 			aiocbe->bp = NULL;
501 		}
502 	}
503 	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
504 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
505 		uma_zfree(aiolio_zone, lj);
506 	}
507 	aiocbe->jobstate = JOBST_NULL;
508 	untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
509 	fdrop(aiocbe->fd_file, curthread);
510 	uma_zfree(aiocb_zone, aiocbe);
511 	return 0;
512 }
513 
514 /*
515  * Rundown the jobs for a given process.
516  */
517 static void
518 aio_proc_rundown(struct proc *p)
519 {
520 	int s;
521 	struct kaioinfo *ki;
522 	struct aio_liojob *lj, *ljn;
523 	struct aiocblist *aiocbe, *aiocbn;
524 	struct file *fp;
525 	struct socket *so;
526 
527 	ki = p->p_aioinfo;
528 	if (ki == NULL)
529 		return;
530 
531 	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
532 	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
533 	    ki->kaio_buffer_finished_count)) {
534 		ki->kaio_flags |= KAIO_RUNDOWN;
535 		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
536 			break;
537 	}
538 
539 	/*
540 	 * Move any aio ops that are waiting on socket I/O to the normal job
541 	 * queues so they are cleaned up with any others.
542 	 */
543 	s = splnet();
544 	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
545 	    aiocbn) {
546 		aiocbn = TAILQ_NEXT(aiocbe, plist);
547 		fp = aiocbe->fd_file;
548 		if (fp != NULL) {
549 			so = (struct socket *)fp->f_data;
550 			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
551 			if (TAILQ_EMPTY(&so->so_aiojobq)) {
552 				so->so_snd.sb_flags &= ~SB_AIO;
553 				so->so_rcv.sb_flags &= ~SB_AIO;
554 			}
555 		}
556 		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
557 		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
558 		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
559 	}
560 	splx(s);
561 
562 restart1:
563 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
564 		aiocbn = TAILQ_NEXT(aiocbe, plist);
565 		if (aio_free_entry(aiocbe))
566 			goto restart1;
567 	}
568 
569 restart2:
570 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
571 	    aiocbn) {
572 		aiocbn = TAILQ_NEXT(aiocbe, plist);
573 		if (aio_free_entry(aiocbe))
574 			goto restart2;
575 	}
576 
577 /*
578  * Note the use of lots of splbio here, trying to avoid splbio for long chains
579  * of I/O.  Probably unnecessary.
580  */
581 restart3:
582 	s = splbio();
583 	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
584 		ki->kaio_flags |= KAIO_WAKEUP;
585 		tsleep(p, PRIBIO, "aioprn", 0);
586 		splx(s);
587 		goto restart3;
588 	}
589 	splx(s);
590 
591 restart4:
592 	s = splbio();
593 	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
594 		aiocbn = TAILQ_NEXT(aiocbe, plist);
595 		if (aio_free_entry(aiocbe)) {
596 			splx(s);
597 			goto restart4;
598 		}
599 	}
600 	splx(s);
601 
602 	/*
603 	 * If we've slept, jobs might have moved from one queue to another.
604 	 * Retry rundown if we didn't manage to empty the queues.
605 	 */
606 	if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
607 	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
608 	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
609 	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
610 		goto restart1;
611 
612 	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
613 		ljn = TAILQ_NEXT(lj, lioj_list);
614 		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
615 		    0)) {
616 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
617 			uma_zfree(aiolio_zone, lj);
618 		} else {
619 #ifdef DIAGNOSTIC
620 			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
621 			    "QF:%d\n", lj->lioj_buffer_count,
622 			    lj->lioj_buffer_finished_count,
623 			    lj->lioj_queue_count,
624 			    lj->lioj_queue_finished_count);
625 #endif
626 		}
627 	}
628 
629 	uma_zfree(kaio_zone, ki);
630 	p->p_aioinfo = NULL;
631 }
632 
633 /*
634  * Select a job to run (called by an AIO daemon).
635  */
636 static struct aiocblist *
637 aio_selectjob(struct aiothreadlist *aiop)
638 {
639 	int s;
640 	struct aiocblist *aiocbe;
641 	struct kaioinfo *ki;
642 	struct proc *userp;
643 
644 	s = splnet();
645 	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
646 	    TAILQ_NEXT(aiocbe, list)) {
647 		userp = aiocbe->userproc;
648 		ki = userp->p_aioinfo;
649 
650 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
651 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
652 			splx(s);
653 			return aiocbe;
654 		}
655 	}
656 	splx(s);
657 
658 	return NULL;
659 }
660 
661 /*
662  * The AIO processing activity.  This is the code that does the I/O request for
663  * the non-physio version of the operations.  The normal vn operations are used,
664  * and this code should work in all instances for every type of file, including
665  * pipes, sockets, fifos, and regular files.
666  */
667 static void
668 aio_process(struct aiocblist *aiocbe)
669 {
670 	struct thread *td;
671 	struct proc *mycp;
672 	struct aiocb *cb;
673 	struct file *fp;
674 	struct uio auio;
675 	struct iovec aiov;
676 	int cnt;
677 	int error;
678 	int oublock_st, oublock_end;
679 	int inblock_st, inblock_end;
680 
681 	td = curthread;
682 	mycp = td->td_proc;
683 	cb = &aiocbe->uaiocb;
684 	fp = aiocbe->fd_file;
685 
686 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
687 	aiov.iov_len = cb->aio_nbytes;
688 
689 	auio.uio_iov = &aiov;
690 	auio.uio_iovcnt = 1;
691 	auio.uio_offset = cb->aio_offset;
692 	auio.uio_resid = cb->aio_nbytes;
693 	cnt = cb->aio_nbytes;
694 	auio.uio_segflg = UIO_USERSPACE;
695 	auio.uio_td = td;
696 
697 	inblock_st = mycp->p_stats->p_ru.ru_inblock;
698 	oublock_st = mycp->p_stats->p_ru.ru_oublock;
699 	/*
700 	 * _aio_aqueue() acquires a reference to the file that is
701 	 * released in aio_free_entry().
702 	 */
703 	if (cb->aio_lio_opcode == LIO_READ) {
704 		auio.uio_rw = UIO_READ;
705 		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
706 	} else {
707 		auio.uio_rw = UIO_WRITE;
708 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
709 	}
710 	inblock_end = mycp->p_stats->p_ru.ru_inblock;
711 	oublock_end = mycp->p_stats->p_ru.ru_oublock;
712 
713 	aiocbe->inputcharge = inblock_end - inblock_st;
714 	aiocbe->outputcharge = oublock_end - oublock_st;
715 
716 	if ((error) && (auio.uio_resid != cnt)) {
717 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
718 			error = 0;
719 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
720 			PROC_LOCK(aiocbe->userproc);
721 			psignal(aiocbe->userproc, SIGPIPE);
722 			PROC_UNLOCK(aiocbe->userproc);
723 		}
724 	}
725 
726 	cnt -= auio.uio_resid;
727 	cb->_aiocb_private.error = error;
728 	cb->_aiocb_private.status = cnt;
729 }
730 
731 /*
732  * The AIO daemon.  Most of the actual work is done in aio_process(),
733  * but the setup (and address space management) is done in this routine.
734  */
735 static void
736 aio_daemon(void *uproc)
737 {
738 	int s;
739 	struct aio_liojob *lj;
740 	struct aiocb *cb;
741 	struct aiocblist *aiocbe;
742 	struct aiothreadlist *aiop;
743 	struct kaioinfo *ki;
744 	struct proc *curcp, *mycp, *userp;
745 	struct vmspace *myvm, *tmpvm;
746 	struct thread *td = curthread;
747 	struct pgrp *newpgrp;
748 	struct session *newsess;
749 
750 	mtx_lock(&Giant);
751 	/*
752 	 * Local copies of curproc (mycp) and vmspace (myvm).
753 	 */
754 	mycp = td->td_proc;
755 	myvm = mycp->p_vmspace;
756 
757 	if (mycp->p_textvp) {
758 		vrele(mycp->p_textvp);
759 		mycp->p_textvp = NULL;
760 	}
761 
762 	/*
763 	 * Allocate and ready the aio control info.  There is one aiop structure
764 	 * per daemon.
765 	 */
766 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
767 	aiop->aiothread = td;
768 	aiop->aiothreadflags |= AIOP_FREE;
769 
770 	s = splnet();
771 
772 	/*
773 	 * Place thread (lightweight process) onto the AIO free thread list.
774 	 */
775 	if (TAILQ_EMPTY(&aio_freeproc))
776 		wakeup(&aio_freeproc);
777 	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
778 
779 	splx(s);
780 
781 	/*
782 	 * Get rid of our current file descriptors.  AIO daemons don't need any
783 	 * file descriptors, except as temporarily inherited from the client.
784 	 */
785 	fdfree(td);
786 
787 	mtx_unlock(&Giant);
788 	/* The daemon resides in its own pgrp. */
789 	MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
790 		M_WAITOK | M_ZERO);
791 	MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION,
792 		M_WAITOK | M_ZERO);
793 
794 	sx_xlock(&proctree_lock);
795 	enterpgrp(mycp, mycp->p_pid, newpgrp, newsess);
796 	sx_xunlock(&proctree_lock);
797 	mtx_lock(&Giant);
798 
799 	/* Mark special process type. */
800 	mycp->p_flag |= P_SYSTEM;
801 
802 	/*
803 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
804 	 * and creating too many daemons.)
805 	 */
806 	wakeup(mycp);
807 
808 	for (;;) {
809 		/*
810 		 * curcp is the current daemon process context.
811 		 * userp is the current user process context.
812 		 */
813 		curcp = mycp;
814 
815 		/*
816 		 * Take daemon off of free queue
817 		 */
818 		if (aiop->aiothreadflags & AIOP_FREE) {
819 			s = splnet();
820 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
821 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
822 			aiop->aiothreadflags &= ~AIOP_FREE;
823 			splx(s);
824 		}
825 		aiop->aiothreadflags &= ~AIOP_SCHED;
826 
827 		/*
828 		 * Check for jobs.
829 		 */
830 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
831 			cb = &aiocbe->uaiocb;
832 			userp = aiocbe->userproc;
833 
834 			aiocbe->jobstate = JOBST_JOBRUNNING;
835 
836 			/*
837 			 * Connect to process address space for user program.
838 			 */
839 			if (userp != curcp) {
840 				/*
841 				 * Save the current address space that we are
842 				 * connected to.
843 				 */
844 				tmpvm = mycp->p_vmspace;
845 
846 				/*
847 				 * Point to the new user address space, and
848 				 * refer to it.
849 				 */
850 				mycp->p_vmspace = userp->p_vmspace;
851 				mycp->p_vmspace->vm_refcnt++;
852 
853 				/* Activate the new mapping. */
854 				pmap_activate(FIRST_THREAD_IN_PROC(mycp));
855 
856 				/*
857 				 * If the old address space wasn't the daemon's
858 				 * own address space, then we need to remove the
859 				 * daemon's reference from the other process
860 				 * that it was acting on behalf of.
861 				 */
862 				if (tmpvm != myvm) {
863 					vmspace_free(tmpvm);
864 				}
865 				curcp = userp;
866 			}
867 
868 			ki = userp->p_aioinfo;
869 			lj = aiocbe->lio;
870 
871 			/* Account for currently active jobs. */
872 			ki->kaio_active_count++;
873 
874 			/* Do the I/O function. */
875 			aio_process(aiocbe);
876 
877 			/* Decrement the active job count. */
878 			ki->kaio_active_count--;
879 
880 			/*
881 			 * Increment the completion count for wakeup/signal
882 			 * comparisons.
883 			 */
884 			aiocbe->jobflags |= AIOCBLIST_DONE;
885 			ki->kaio_queue_finished_count++;
886 			if (lj)
887 				lj->lioj_queue_finished_count++;
888 			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
889 			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
890 				ki->kaio_flags &= ~KAIO_WAKEUP;
891 				wakeup(userp);
892 			}
893 
894 			s = splbio();
895 			if (lj && (lj->lioj_flags &
896 			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
897 				if ((lj->lioj_queue_finished_count ==
898 				    lj->lioj_queue_count) &&
899 				    (lj->lioj_buffer_finished_count ==
900 				    lj->lioj_buffer_count)) {
901 					PROC_LOCK(userp);
902 					psignal(userp,
903 					    lj->lioj_signal.sigev_signo);
904 					PROC_UNLOCK(userp);
905 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
906 				}
907 			}
908 			splx(s);
909 
910 			aiocbe->jobstate = JOBST_JOBFINISHED;
911 
912 			s = splnet();
913 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
914 			TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
915 			splx(s);
916 			KNOTE(&aiocbe->klist, 0);
917 
918 			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
919 				wakeup(aiocbe);
920 				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
921 			}
922 
923 			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
924 				PROC_LOCK(userp);
925 				psignal(userp, cb->aio_sigevent.sigev_signo);
926 				PROC_UNLOCK(userp);
927 			}
928 		}
929 
930 		/*
931 		 * Disconnect from user address space.
932 		 */
933 		if (curcp != mycp) {
934 			/* Get the user address space to disconnect from. */
935 			tmpvm = mycp->p_vmspace;
936 
937 			/* Get original address space for daemon. */
938 			mycp->p_vmspace = myvm;
939 
940 			/* Activate the daemon's address space. */
941 			pmap_activate(FIRST_THREAD_IN_PROC(mycp));
942 #ifdef DIAGNOSTIC
943 			if (tmpvm == myvm) {
944 				printf("AIOD: vmspace problem -- %d\n",
945 				    mycp->p_pid);
946 			}
947 #endif
948 			/* Remove our vmspace reference. */
949 			vmspace_free(tmpvm);
950 
951 			curcp = mycp;
952 		}
953 
954 		/*
955 		 * If we are the first to be put onto the free queue, wake up
956 		 * anyone waiting for a daemon.
957 		 */
958 		s = splnet();
959 		TAILQ_REMOVE(&aio_activeproc, aiop, list);
960 		if (TAILQ_EMPTY(&aio_freeproc))
961 			wakeup(&aio_freeproc);
962 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
963 		aiop->aiothreadflags |= AIOP_FREE;
964 		splx(s);
965 
966 		/*
967 		 * If daemon is inactive for a long time, allow it to exit,
968 		 * thereby freeing resources.
969 		 */
970 		if ((aiop->aiothreadflags & AIOP_SCHED) == 0 &&
971 		    tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) {
972 			s = splnet();
973 			if (TAILQ_EMPTY(&aio_jobs)) {
974 				if ((aiop->aiothreadflags & AIOP_FREE) &&
975 				    (num_aio_procs > target_aio_procs)) {
976 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
977 					splx(s);
978 					uma_zfree(aiop_zone, aiop);
979 					num_aio_procs--;
980 #ifdef DIAGNOSTIC
981 					if (mycp->p_vmspace->vm_refcnt <= 1) {
982 						printf("AIOD: bad vm refcnt for"
983 						    " exiting daemon: %d\n",
984 						    mycp->p_vmspace->vm_refcnt);
985 					}
986 #endif
987 					kthread_exit(0);
988 				}
989 			}
990 			splx(s);
991 		}
992 	}
993 }
994 
995 /*
996  * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
997  * AIO daemon modifies its environment itself.
998  */
999 static int
1000 aio_newproc(void)
1001 {
1002 	int error;
1003 	struct proc *p;
1004 
1005 	error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, 0, "aiod%d",
1006 			       num_aio_procs);
1007 	if (error)
1008 		return error;
1009 
1010 	/*
1011 	 * Wait until daemon is started, but continue on just in case to
1012 	 * handle error conditions.
1013 	 */
1014 	error = tsleep(p, PZERO, "aiosta", aiod_timeout);
1015 
1016 	num_aio_procs++;
1017 
1018 	return error;
1019 }
1020 
1021 /*
1022  * Try the high-performance, low-overhead physio method for eligible
1023  * VCHR devices.  This method doesn't use an aio helper thread, and
1024  * thus has very low overhead.
1025  *
1026  * Assumes that the caller, _aio_aqueue(), has incremented the file
1027  * structure's reference count, preventing its deallocation for the
1028  * duration of this call.
1029  */
1030 static int
1031 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1032 {
1033 	int error;
1034 	struct aiocb *cb;
1035 	struct file *fp;
1036 	struct buf *bp;
1037 	struct vnode *vp;
1038 	struct kaioinfo *ki;
1039 	struct aio_liojob *lj;
1040 	int s;
1041 	int notify;
1042 
1043 	cb = &aiocbe->uaiocb;
1044 	fp = aiocbe->fd_file;
1045 
1046 	if (fp->f_type != DTYPE_VNODE)
1047 		return (-1);
1048 
1049 	vp = (struct vnode *)fp->f_data;
1050 
1051 	/*
1052 	 * If it's not a disk, we don't want to return a positive error.
1053 	 * That would keep the aio code from falling through to the
1054 	 * thread-based path when the target is a regular file.
1055 	 */
1056 	if (!vn_isdisk(vp, &error)) {
1057 		if (error == ENOTBLK)
1058 			return (-1);
1059 		else
1060 			return (error);
1061 	}
1062 
1063  	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
1064 		return (-1);
1065 
1066 	if (cb->aio_nbytes >
1067 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1068 		return (-1);
1069 
1070 	ki = p->p_aioinfo;
1071 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1072 		return (-1);
1073 
1074 	ki->kaio_buffer_count++;
1075 
1076 	lj = aiocbe->lio;
1077 	if (lj)
1078 		lj->lioj_buffer_count++;
1079 
1080 	/* Create and build a buffer header for a transfer. */
1081 	bp = (struct buf *)getpbuf(NULL);
1082 	BUF_KERNPROC(bp);
1083 
1084 	/*
1085 	 * Get a copy of the kva from the physical buffer.
1086 	 */
1087 	bp->b_caller1 = p;
1088 	bp->b_dev = vp->v_rdev;
1089 	error = bp->b_error = 0;
1090 
1091 	bp->b_bcount = cb->aio_nbytes;
1092 	bp->b_bufsize = cb->aio_nbytes;
1093 	bp->b_flags = B_PHYS;
1094 	bp->b_iodone = aio_physwakeup;
1095 	bp->b_saveaddr = bp->b_data;
1096 	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1097 	bp->b_blkno = btodb(cb->aio_offset);
1098 
1099 	if (cb->aio_lio_opcode == LIO_WRITE) {
1100 		bp->b_iocmd = BIO_WRITE;
1101 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1102 			error = EFAULT;
1103 			goto doerror;
1104 		}
1105 	} else {
1106 		bp->b_iocmd = BIO_READ;
1107 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1108 			error = EFAULT;
1109 			goto doerror;
1110 		}
1111 	}
1112 
1113 	/* Bring buffer into kernel space. */
1114 	vmapbuf(bp);
1115 
1116 	s = splbio();
1117 	aiocbe->bp = bp;
1118 	bp->b_spc = (void *)aiocbe;
1119 	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1120 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1121 	aiocbe->jobstate = JOBST_JOBQBUF;
1122 	cb->_aiocb_private.status = cb->aio_nbytes;
1123 	num_buf_aio++;
1124 	bp->b_error = 0;
1125 
1126 	splx(s);
1127 
1128 	/* Perform transfer. */
1129 	DEV_STRATEGY(bp, 0);
1130 
1131 	notify = 0;
1132 	s = splbio();
1133 
1134 	/*
1135 	 * If we had an error invoking the request, or an error in processing
1136 	 * the request before we have returned, we process it as an error in
1137 	 * transfer.  Note that such an I/O error is not indicated immediately,
1138 	 * but is returned using the aio_error mechanism.  In this case,
1139 	 * aio_suspend will return immediately.
1140 	 */
1141 	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1142 		struct aiocb *job = aiocbe->uuaiocb;
1143 
1144 		aiocbe->uaiocb._aiocb_private.status = 0;
1145 		suword(&job->_aiocb_private.status, 0);
1146 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1147 		suword(&job->_aiocb_private.error, bp->b_error);
1148 
1149 		ki->kaio_buffer_finished_count++;
1150 
1151 		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1152 			aiocbe->jobstate = JOBST_JOBBFINISHED;
1153 			aiocbe->jobflags |= AIOCBLIST_DONE;
1154 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1155 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1156 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1157 			notify = 1;
1158 		}
1159 	}
1160 	splx(s);
1161 	if (notify)
1162 		KNOTE(&aiocbe->klist, 0);
1163 	return 0;
1164 
1165 doerror:
1166 	ki->kaio_buffer_count--;
1167 	if (lj)
1168 		lj->lioj_buffer_count--;
1169 	aiocbe->bp = NULL;
1170 	relpbuf(bp, NULL);
1171 	return error;
1172 }
1173 
1174 /*
1175  * This waits/tests physio completion.
1176  */
1177 static int
1178 aio_fphysio(struct aiocblist *iocb)
1179 {
1180 	int s;
1181 	struct buf *bp;
1182 	int error;
1183 
1184 	bp = iocb->bp;
1185 
1186 	s = splbio();
1187 	while ((bp->b_flags & B_DONE) == 0) {
1188 		if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
1189 			if ((bp->b_flags & B_DONE) == 0) {
1190 				splx(s);
1191 				return EINPROGRESS;
1192 			} else
1193 				break;
1194 		}
1195 	}
1196 	splx(s);
1197 
1198 	/* Release mapping into kernel space. */
1199 	vunmapbuf(bp);
1200 	iocb->bp = 0;
1201 
1202 	error = 0;
1203 
1204 	/* Check for an error. */
1205 	if (bp->b_ioflags & BIO_ERROR)
1206 		error = bp->b_error;
1207 
1208 	relpbuf(bp, NULL);
1209 	return (error);
1210 }
1211 
1212 /*
1213  * Wake up aio requests that may be serviceable now.
1214  */
1215 static void
1216 aio_swake_cb(struct socket *so, struct sockbuf *sb)
1217 {
1218 	struct aiocblist *cb,*cbn;
1219 	struct proc *p;
1220 	struct kaioinfo *ki = NULL;
1221 	int opcode, wakecount = 0;
1222 	struct aiothreadlist *aiop;
1223 
1224 	if (sb == &so->so_snd) {
1225 		opcode = LIO_WRITE;
1226 		so->so_snd.sb_flags &= ~SB_AIO;
1227 	} else {
1228 		opcode = LIO_READ;
1229 		so->so_rcv.sb_flags &= ~SB_AIO;
1230 	}
1231 
1232 	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1233 		cbn = TAILQ_NEXT(cb, list);
1234 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1235 			p = cb->userproc;
1236 			ki = p->p_aioinfo;
1237 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1238 			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1239 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1240 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1241 			wakecount++;
1242 			if (cb->jobstate != JOBST_JOBQGLOBAL)
1243 				panic("invalid queue value");
1244 		}
1245 	}
1246 
1247 	while (wakecount--) {
1248 		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1249 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1250 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1251 			aiop->aiothreadflags &= ~AIOP_FREE;
1252 			wakeup(aiop->aiothread);
1253 		}
1254 	}
1255 }
1256 
1257 /*
1258  * Queue a new AIO request.  The choice between the threaded and the direct
1259  * physio (VCHR) technique is made in this code.
1260  */
1261 static int
1262 _aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
1263 {
1264 	struct proc *p = td->td_proc;
1265 	struct filedesc *fdp;
1266 	struct file *fp;
1267 	unsigned int fd;
1268 	struct socket *so;
1269 	int s;
1270 	int error;
1271 	int opcode, user_opcode;
1272 	struct aiocblist *aiocbe;
1273 	struct aiothreadlist *aiop;
1274 	struct kaioinfo *ki;
1275 	struct kevent kev;
1276 	struct kqueue *kq;
1277 	struct file *kq_fp;
1278 
1279 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK);
1280 	aiocbe->inputcharge = 0;
1281 	aiocbe->outputcharge = 0;
1282 	callout_handle_init(&aiocbe->timeouthandle);
1283 	SLIST_INIT(&aiocbe->klist);
1284 
1285 	suword(&job->_aiocb_private.status, -1);
1286 	suword(&job->_aiocb_private.error, 0);
1287 	suword(&job->_aiocb_private.kernelinfo, -1);
1288 
1289 	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1290 	if (error) {
1291 		suword(&job->_aiocb_private.error, error);
1292 		uma_zfree(aiocb_zone, aiocbe);
1293 		return error;
1294 	}
1295 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1296 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1297 		uma_zfree(aiocb_zone, aiocbe);
1298 		return EINVAL;
1299 	}
1300 
1301 	/* Save userspace address of the job info. */
1302 	aiocbe->uuaiocb = job;
1303 
1304 	/* Get the opcode. */
1305 	user_opcode = aiocbe->uaiocb.aio_lio_opcode;
1306 	if (type != LIO_NOP)
1307 		aiocbe->uaiocb.aio_lio_opcode = type;
1308 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1309 
1310 	/* Get the fd info for process. */
1311 	fdp = p->p_fd;
1312 
1313 	/*
1314 	 * Range check file descriptor.
1315 	 */
1316 	fd = aiocbe->uaiocb.aio_fildes;
1317 	if (fd >= fdp->fd_nfiles) {
1318 		uma_zfree(aiocb_zone, aiocbe);
1319 		if (type == 0)
1320 			suword(&job->_aiocb_private.error, EBADF);
1321 		return EBADF;
1322 	}
1323 
1324 	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1325 	if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1326 	    0))) {
1327 		uma_zfree(aiocb_zone, aiocbe);
1328 		if (type == 0)
1329 			suword(&job->_aiocb_private.error, EBADF);
1330 		return EBADF;
1331 	}
1332 	fhold(fp);
1333 
1334 	if (aiocbe->uaiocb.aio_offset == -1LL) {
1335 		error = EINVAL;
1336 		goto aqueue_fail;
1337 	}
1338 	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1339 	if (error) {
1340 		error = EINVAL;
1341 		goto aqueue_fail;
1342 	}
1343 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1344 	if (jobrefid == LONG_MAX)
1345 		jobrefid = 1;
1346 	else
1347 		jobrefid++;
1348 
1349 	if (opcode == LIO_NOP) {
1350 		fdrop(fp, td);
1351 		uma_zfree(aiocb_zone, aiocbe);
1352 		if (type == 0) {
1353 			suword(&job->_aiocb_private.error, 0);
1354 			suword(&job->_aiocb_private.status, 0);
1355 			suword(&job->_aiocb_private.kernelinfo, 0);
1356 		}
1357 		return 0;
1358 	}
1359 	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1360 		if (type == 0)
1361 			suword(&job->_aiocb_private.status, 0);
1362 		error = EINVAL;
1363 		goto aqueue_fail;
1364 	}
1365 
1366 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1367 		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1368 		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1369 	}
1370 	else {
1371 		/*
1372 		 * This method for requesting kevent-based notification won't
1373 		 * work on the alpha, since we're passing in a pointer
1374 		 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
1375 		 * based method instead.
1376 		 */
1377 		if (user_opcode == LIO_NOP || user_opcode == LIO_READ ||
1378 		    user_opcode == LIO_WRITE)
1379 			goto no_kqueue;
1380 
1381 		error = copyin((struct kevent *)(uintptr_t)user_opcode,
1382 		    &kev, sizeof(kev));
1383 		if (error)
1384 			goto aqueue_fail;
1385 	}
1386 	if ((u_int)kev.ident >= fdp->fd_nfiles ||
1387 	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1388 	    (kq_fp->f_type != DTYPE_KQUEUE)) {
1389 		error = EBADF;
1390 		goto aqueue_fail;
1391 	}
1392 	kq = (struct kqueue *)kq_fp->f_data;
1393 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
1394 	kev.filter = EVFILT_AIO;
1395 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1396 	kev.data = (intptr_t)aiocbe;
1397 	error = kqueue_register(kq, &kev, td);
1398 aqueue_fail:
1399 	if (error) {
1400 		fdrop(fp, td);
1401 		uma_zfree(aiocb_zone, aiocbe);
1402 		if (type == 0)
1403 			suword(&job->_aiocb_private.error, error);
1404 		goto done;
1405 	}
1406 no_kqueue:
1407 
1408 	suword(&job->_aiocb_private.error, EINPROGRESS);
1409 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1410 	aiocbe->userproc = p;
1411 	aiocbe->jobflags = 0;
1412 	aiocbe->lio = lj;
1413 	ki = p->p_aioinfo;
1414 
1415 	if (fp->f_type == DTYPE_SOCKET) {
1416 		/*
1417 		 * Alternate queueing for socket ops: Reach down into the
1418 		 * descriptor to get the socket data.  Then check to see if the
1419 		 * socket is ready to be read or written (based on the requested
1420 		 * operation).
1421 		 *
1422 		 * If it is not ready for I/O, then queue the aiocbe on the
1423 		 * socket, and set the flags so we get a call when sbnotify()
1424 		 * happens.
1425 		 */
1426 		so = (struct socket *)fp->f_data;
1427 		s = splnet();
1428 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1429 		    LIO_WRITE) && (!sowriteable(so)))) {
1430 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1431 			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1432 			if (opcode == LIO_READ)
1433 				so->so_rcv.sb_flags |= SB_AIO;
1434 			else
1435 				so->so_snd.sb_flags |= SB_AIO;
1436 			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1437 			ki->kaio_queue_count++;
1438 			num_queue_count++;
1439 			splx(s);
1440 			error = 0;
1441 			goto done;
1442 		}
1443 		splx(s);
1444 	}
1445 
1446 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1447 		goto done;
1448 	if (error > 0) {
1449 		suword(&job->_aiocb_private.status, 0);
1450 		aiocbe->uaiocb._aiocb_private.error = error;
1451 		suword(&job->_aiocb_private.error, error);
1452 		goto done;
1453 	}
1454 
1455 	/* No buffer for daemon I/O. */
1456 	aiocbe->bp = NULL;
1457 
1458 	ki->kaio_queue_count++;
1459 	if (lj)
1460 		lj->lioj_queue_count++;
1461 	s = splnet();
1462 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1463 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1464 	splx(s);
1465 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1466 
1467 	num_queue_count++;
1468 	error = 0;
1469 
1470 	/*
1471 	 * If we don't have a free AIO process, and we are below our quota, then
1472 	 * start one.  Otherwise, depend on the subsequent I/O completions to
1473 	 * pick up this job.  If we don't successfully create the new process
1474 	 * (thread) due to resource issues, we return an error for now (EAGAIN),
1475 	 * which is likely not the correct thing to do.
1476 	 */
1477 	s = splnet();
1478 retryproc:
1479 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1480 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1481 		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1482 		aiop->aiothreadflags &= ~AIOP_FREE;
1483 		wakeup(aiop->aiothread);
1484 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1485 	    ((ki->kaio_active_count + num_aio_resv_start) <
1486 	    ki->kaio_maxactive_count)) {
1487 		num_aio_resv_start++;
1488 		if ((error = aio_newproc()) == 0) {
1489 			num_aio_resv_start--;
1490 			goto retryproc;
1491 		}
1492 		num_aio_resv_start--;
1493 	}
1494 	splx(s);
1495 done:
1496 	return error;
1497 }
1498 
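/*
 * Besides signal delivery, _aio_aqueue() above registers a kqueue knote
 * when the caller requests SIGEV_KEVENT notification.  A minimal userland
 * sketch of that path (illustrative only; "fd" and "buf" are assumed to be
 * set up already and error handling is omitted):
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	int kq = kqueue();
 *	struct aiocb acb;
 *	struct kevent ev;
 *
 *	memset(&acb, 0, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = sizeof(buf);
 *	acb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	acb.aio_sigevent.sigev_notify_kqueue = kq;
 *	acb.aio_sigevent.sigev_value.sigval_ptr = &acb;
 *	aio_read(&acb);
 *
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *	aio_return((struct aiocb *)ev.ident);
 */
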
1499 /*
1500  * This routine queues an AIO request, checking for quotas.
1501  */
1502 static int
1503 aio_aqueue(struct thread *td, struct aiocb *job, int type)
1504 {
1505 	struct proc *p = td->td_proc;
1506 	struct kaioinfo *ki;
1507 
1508 	if (p->p_aioinfo == NULL)
1509 		aio_init_aioinfo(p);
1510 
1511 	if (num_queue_count >= max_queue_count)
1512 		return EAGAIN;
1513 
1514 	ki = p->p_aioinfo;
1515 	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1516 		return EAGAIN;
1517 
1518 	return _aio_aqueue(td, job, NULL, type);
1519 }
1520 
1521 /*
1522  * Support the aio_return system call; as a side effect, kernel resources are
1523  * released.
1524  */
1525 int
1526 aio_return(struct thread *td, struct aio_return_args *uap)
1527 {
1528 	struct proc *p = td->td_proc;
1529 	int s;
1530 	long jobref;
1531 	struct aiocblist *cb, *ncb;
1532 	struct aiocb *ujob;
1533 	struct kaioinfo *ki;
1534 
1535 	ujob = uap->aiocbp;
1536 	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1537 	if (jobref == -1 || jobref == 0)
1538 		return EINVAL;
1539 
1540 	ki = p->p_aioinfo;
1541 	if (ki == NULL)
1542 		return EINVAL;
1543 	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1544 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1545 		    jobref) {
1546 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1547 				p->p_stats->p_ru.ru_oublock +=
1548 				    cb->outputcharge;
1549 				cb->outputcharge = 0;
1550 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1551 				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1552 				cb->inputcharge = 0;
1553 			}
1554 			goto done;
1555 		}
1556 	}
1557 	s = splbio();
1558 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1559 		ncb = TAILQ_NEXT(cb, plist);
1560 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1561 		    == jobref) {
1562 			break;
1563 		}
1564 	}
1565 	splx(s);
1566  done:
1567 	if (cb != NULL) {
1568 		if (ujob == cb->uuaiocb) {
1569 			td->td_retval[0] =
1570 			    cb->uaiocb._aiocb_private.status;
1571 		} else
1572 			td->td_retval[0] = EFAULT;
1573 		aio_free_entry(cb);
1574 		return (0);
1575 	}
1576 	return (EINVAL);
1577 }
1578 
1579 /*
1580  * Allow a process to wake up when any of the I/O requests are completed.
1581  */
1582 int
1583 aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1584 {
1585 	struct proc *p = td->td_proc;
1586 	struct timeval atv;
1587 	struct timespec ts;
1588 	struct aiocb *const *cbptr, *cbp;
1589 	struct kaioinfo *ki;
1590 	struct aiocblist *cb;
1591 	int i;
1592 	int njoblist;
1593 	int error, s, timo;
1594 	long *ijoblist;
1595 	struct aiocb **ujoblist;
1596 
1597 	if (uap->nent > AIO_LISTIO_MAX)
1598 		return EINVAL;
1599 
1600 	timo = 0;
1601 	if (uap->timeout) {
1602 		/* Get timespec struct. */
1603 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1604 			return error;
1605 
1606 		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1607 			return (EINVAL);
1608 
1609 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
1610 		if (itimerfix(&atv))
1611 			return (EINVAL);
1612 		timo = tvtohz(&atv);
1613 	}
1614 
1615 	ki = p->p_aioinfo;
1616 	if (ki == NULL)
1617 		return EAGAIN;
1618 
1619 	njoblist = 0;
1620 	ijoblist = uma_zalloc(aiol_zone, M_WAITOK);
1621 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
1622 	cbptr = uap->aiocbp;
1623 
1624 	for (i = 0; i < uap->nent; i++) {
1625 		cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
1626 		if (cbp == 0)
1627 			continue;
1628 		ujoblist[njoblist] = cbp;
1629 		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1630 		njoblist++;
1631 	}
1632 
1633 	if (njoblist == 0) {
1634 		uma_zfree(aiol_zone, ijoblist);
1635 		uma_zfree(aiol_zone, ujoblist);
1636 		return 0;
1637 	}
1638 
1639 	error = 0;
1640 	for (;;) {
1641 		TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1642 			for (i = 0; i < njoblist; i++) {
1643 				if (((intptr_t)
1644 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1645 				    ijoblist[i]) {
1646 					if (ujoblist[i] != cb->uuaiocb)
1647 						error = EINVAL;
1648 					uma_zfree(aiol_zone, ijoblist);
1649 					uma_zfree(aiol_zone, ujoblist);
1650 					return error;
1651 				}
1652 			}
1653 		}
1654 
1655 		s = splbio();
1656 		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1657 		    TAILQ_NEXT(cb, plist)) {
1658 			for (i = 0; i < njoblist; i++) {
1659 				if (((intptr_t)
1660 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1661 				    ijoblist[i]) {
1662 					splx(s);
1663 					if (ujoblist[i] != cb->uuaiocb)
1664 						error = EINVAL;
1665 					uma_zfree(aiol_zone, ijoblist);
1666 					uma_zfree(aiol_zone, ujoblist);
1667 					return error;
1668 				}
1669 			}
1670 		}
1671 
1672 		ki->kaio_flags |= KAIO_WAKEUP;
1673 		error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1674 		splx(s);
1675 
1676 		if (error == ERESTART || error == EINTR) {
1677 			uma_zfree(aiol_zone, ijoblist);
1678 			uma_zfree(aiol_zone, ujoblist);
1679 			return EINTR;
1680 		} else if (error == EWOULDBLOCK) {
1681 			uma_zfree(aiol_zone, ijoblist);
1682 			uma_zfree(aiol_zone, ujoblist);
1683 			return EAGAIN;
1684 		}
1685 	}
1686 
1687 /* NOTREACHED */
1688 	return EINVAL;
1689 }
1690 
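/*
 * From userland, the call above has the usual POSIX shape: it returns 0 as
 * soon as one of the listed requests has completed, or fails with EAGAIN on
 * timeout and EINTR on signal delivery, matching the errors produced above.
 * A minimal sketch that retries on EINTR (illustrative only; acb1 and acb2
 * are assumed to be requests that were queued earlier):
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	const struct aiocb *list[2] = { &acb1, &acb2 };
 *	struct timespec to = { 1, 0 };
 *
 *	while (aio_suspend(list, 2, &to) == -1 && errno == EINTR)
 *		;
 */
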
1691 /*
1692  * aio_cancel cancels any non-physio aio operations not currently in
1693  * progress.
1694  */
1695 int
1696 aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1697 {
1698 	struct proc *p = td->td_proc;
1699 	struct kaioinfo *ki;
1700 	struct aiocblist *cbe, *cbn;
1701 	struct file *fp;
1702 	struct filedesc *fdp;
1703 	struct socket *so;
1704 	struct proc *po;
1705 	int s,error;
1706 	int cancelled=0;
1707 	int notcancelled=0;
1708 	struct vnode *vp;
1709 
1710 	fdp = p->p_fd;
1711 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
1712 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1713 		return (EBADF);
1714 
1715 	if (fp->f_type == DTYPE_VNODE) {
1716 		vp = (struct vnode *)fp->f_data;
1717 
1718 		if (vn_isdisk(vp, &error)) {
1719 			td->td_retval[0] = AIO_NOTCANCELED;
1720 			return 0;
1721 		}
1722 	} else if (fp->f_type == DTYPE_SOCKET) {
1723 		so = (struct socket *)fp->f_data;
1724 
1725 		s = splnet();
1726 
1727 		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1728 			cbn = TAILQ_NEXT(cbe, list);
1729 			if ((uap->aiocbp == NULL) ||
1730 				(uap->aiocbp == cbe->uuaiocb) ) {
1731 				po = cbe->userproc;
1732 				ki = po->p_aioinfo;
1733 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1734 				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1735 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1736 				if (ki->kaio_flags & KAIO_WAKEUP) {
1737 					wakeup(po);
1738 				}
1739 				cbe->jobstate = JOBST_JOBFINISHED;
1740 				cbe->uaiocb._aiocb_private.status=-1;
1741 				cbe->uaiocb._aiocb_private.error=ECANCELED;
1742 				cancelled++;
1743 /* XXX cancelled, knote? */
1744 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1745 				    SIGEV_SIGNAL) {
1746 					PROC_LOCK(cbe->userproc);
1747 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1748 					PROC_UNLOCK(cbe->userproc);
1749 				}
1750 				if (uap->aiocbp)
1751 					break;
1752 			}
1753 		}
1754 		splx(s);
1755 
1756 		if ((cancelled) && (uap->aiocbp)) {
1757 			td->td_retval[0] = AIO_CANCELED;
1758 			return 0;
1759 		}
1760 	}
1761 	ki=p->p_aioinfo;
1762 	if (ki == NULL)
1763 		goto done;
1764 	s = splnet();
1765 
1766 	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1767 		cbn = TAILQ_NEXT(cbe, plist);
1768 
1769 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1770 		    ((uap->aiocbp == NULL ) ||
1771 		     (uap->aiocbp == cbe->uuaiocb))) {
1772 
1773 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1774 				TAILQ_REMOVE(&aio_jobs, cbe, list);
1775 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1776 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1777 				    plist);
1778 				cancelled++;
1779 				ki->kaio_queue_finished_count++;
1780 				cbe->jobstate = JOBST_JOBFINISHED;
1781 				cbe->uaiocb._aiocb_private.status = -1;
1782 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1783 /* XXX cancelled, knote? */
1784 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1785 				    SIGEV_SIGNAL) {
1786 					PROC_LOCK(cbe->userproc);
1787 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1788 					PROC_UNLOCK(cbe->userproc);
1789 				}
1790 			} else {
1791 				notcancelled++;
1792 			}
1793 		}
1794 	}
1795 	splx(s);
1796 done:
1797 	if (notcancelled) {
1798 		td->td_retval[0] = AIO_NOTCANCELED;
1799 		return 0;
1800 	}
1801 	if (cancelled) {
1802 		td->td_retval[0] = AIO_CANCELED;
1803 		return 0;
1804 	}
1805 	td->td_retval[0] = AIO_ALLDONE;
1806 
1807 	return 0;
1808 }
1809 
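/*
 * The AIO_* values stored in td_retval above are what userland sees as the
 * return value of aio_cancel(2): AIO_CANCELED means the request(s) were
 * cancelled (aio_error() then reports ECANCELED and aio_return() must still
 * be called to release them), AIO_NOTCANCELED means at least one request is
 * still in progress (e.g. raw-disk physio, as handled above), and
 * AIO_ALLDONE means nothing was outstanding.  A minimal sketch (illustrative
 * only; "fd" and "acb" refer to a previously queued request):
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	int how = aio_cancel(fd, &acb);
 *	if (how == AIO_CANCELED && aio_error(&acb) == ECANCELED)
 *		(void)aio_return(&acb);
 */
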
1810 /*
1811  * aio_error is implemented at the kernel level for compatibility purposes only.
1812  * For a user mode async implementation, it would be best to do it in a userland
1813  * subroutine.
1814  */
1815 int
1816 aio_error(struct thread *td, struct aio_error_args *uap)
1817 {
1818 	struct proc *p = td->td_proc;
1819 	int s;
1820 	struct aiocblist *cb;
1821 	struct kaioinfo *ki;
1822 	long jobref;
1823 
1824 	ki = p->p_aioinfo;
1825 	if (ki == NULL)
1826 		return EINVAL;
1827 
1828 	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1829 	if ((jobref == -1) || (jobref == 0))
1830 		return EINVAL;
1831 
1832 	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1833 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1834 		    jobref) {
1835 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1836 			return 0;
1837 		}
1838 	}
1839 
1840 	s = splnet();
1841 
1842 	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1843 	    plist)) {
1844 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1845 		    jobref) {
1846 			td->td_retval[0] = EINPROGRESS;
1847 			splx(s);
1848 			return 0;
1849 		}
1850 	}
1851 
1852 	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1853 	    plist)) {
1854 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1855 		    jobref) {
1856 			td->td_retval[0] = EINPROGRESS;
1857 			splx(s);
1858 			return 0;
1859 		}
1860 	}
1861 	splx(s);
1862 
1863 	s = splbio();
1864 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1865 	    plist)) {
1866 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1867 		    jobref) {
1868 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1869 			splx(s);
1870 			return 0;
1871 		}
1872 	}
1873 
1874 	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1875 	    plist)) {
1876 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1877 		    jobref) {
1878 			td->td_retval[0] = EINPROGRESS;
1879 			splx(s);
1880 			return 0;
1881 		}
1882 	}
1883 	splx(s);
1884 
1885 #if (0)
1886 	/*
1887 	 * Hack for lio.
1888 	 */
1889 	status = fuword(&uap->aiocbp->_aiocb_private.status);
1890 	if (status == -1)
1891 		return fuword(&uap->aiocbp->_aiocb_private.error);
1892 #endif
1893 	return EINVAL;
1894 }
1895 
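/*
 * Illustrative userland sketch (compiled out, not part of this file): queueing
 * a read with aio_read(2) and polling it with aio_error(2), which the kernel
 * code above answers from the per-process job queues.  The file name, helper
 * name, and one-millisecond poll interval are hypothetical.
 */
#if 0
#include <sys/types.h>
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static ssize_t
read_file_async(void *buf, size_t len)
{
	struct aiocb acb;
	int error, fd;

	fd = open("/tmp/data", O_RDONLY);
	if (fd == -1)
		return (-1);

	memset(&acb, 0, sizeof(acb));
	acb.aio_fildes = fd;
	acb.aio_buf = buf;
	acb.aio_nbytes = len;
	acb.aio_offset = 0;

	if (aio_read(&acb) == -1) {
		close(fd);
		return (-1);
	}

	/* aio_error() returns EINPROGRESS until the request is done. */
	while ((error = aio_error(&acb)) == EINPROGRESS)
		usleep(1000);

	close(fd);
	if (error != 0) {
		errno = error;
		return (-1);
	}
	/* aio_return() yields the transfer count and releases the request. */
	return (aio_return(&acb));
}
#endif
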
1896 /* syscall - asynchronous read from a file (REALTIME) */
1897 int
1898 aio_read(struct thread *td, struct aio_read_args *uap)
1899 {
1900 
1901 	return aio_aqueue(td, uap->aiocbp, LIO_READ);
1902 }
1903 
1904 /* syscall - asynchronous write to a file (REALTIME) */
1905 int
1906 aio_write(struct thread *td, struct aio_write_args *uap)
1907 {
1908 
1909 	return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
1910 }
1911 
1912 /* syscall - list directed I/O (REALTIME) */
1913 int
1914 lio_listio(struct thread *td, struct lio_listio_args *uap)
1915 {
1916 	struct proc *p = td->td_proc;
1917 	int nent, nentqueued;
1918 	struct aiocb *iocb, * const *cbptr;
1919 	struct aiocblist *cb;
1920 	struct kaioinfo *ki;
1921 	struct aio_liojob *lj;
1922 	int error, runningcode;
1923 	int nerror;
1924 	int i;
1925 	int s;
1926 
1927 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1928 		return EINVAL;
1929 
1930 	nent = uap->nent;
1931 	if (nent < 0 || nent > AIO_LISTIO_MAX)
1932 		return EINVAL;
1933 
1934 	if (p->p_aioinfo == NULL)
1935 		aio_init_aioinfo(p);
1936 
1937 	if ((nent + num_queue_count) > max_queue_count)
1938 		return EAGAIN;
1939 
1940 	ki = p->p_aioinfo;
1941 	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1942 		return EAGAIN;
1943 
1944 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
1945 	if (!lj)
1946 		return EAGAIN;
1947 
1948 	lj->lioj_flags = 0;
1949 	lj->lioj_buffer_count = 0;
1950 	lj->lioj_buffer_finished_count = 0;
1951 	lj->lioj_queue_count = 0;
1952 	lj->lioj_queue_finished_count = 0;
1953 	lj->lioj_ki = ki;
1954 
1955 	/*
1956 	 * Setup signal.
1957 	 */
1958 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1959 		error = copyin(uap->sig, &lj->lioj_signal,
1960 			       sizeof(lj->lioj_signal));
1961 		if (error) {
1962 			uma_zfree(aiolio_zone, lj);
1963 			return error;
1964 		}
1965 		if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
1966 			uma_zfree(aiolio_zone, lj);
1967 			return EINVAL;
1968 		}
1969 		lj->lioj_flags |= LIOJ_SIGNAL;
1970 	}
1971 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
1972 	/*
1973 	 * Get pointers to the list of I/O requests.
1974 	 */
1975 	nerror = 0;
1976 	nentqueued = 0;
1977 	cbptr = uap->acb_list;
1978 	for (i = 0; i < uap->nent; i++) {
1979 		iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
1980 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
1981 			error = _aio_aqueue(td, iocb, lj, 0);
1982 			if (error == 0)
1983 				nentqueued++;
1984 			else
1985 				nerror++;
1986 		}
1987 	}
1988 
1989 	/*
1990 	 * If we haven't queued any requests, then just return without error.
1991 	 */
1992 	if (nentqueued == 0)
1993 		return 0;
1994 
1995 	/*
1996 	 * Calculate the appropriate error return.
1997 	 */
1998 	runningcode = 0;
1999 	if (nerror)
2000 		runningcode = EIO;
2001 
2002 	if (uap->mode == LIO_WAIT) {
2003 		int command, found, jobref;
2004 
2005 		for (;;) {
2006 			found = 0;
2007 			for (i = 0; i < uap->nent; i++) {
2008 				/*
2009 				 * Fetch address of the control buf pointer in
2010 				 * user space.
2011 				 */
2012 				iocb = (struct aiocb *)
2013 				    (intptr_t)fuword(&cbptr[i]);
2014 				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2015 				    == 0))
2016 					continue;
2017 
2018 				/*
2019 				 * Fetch the associated command from user space.
2020 				 */
2021 				command = fuword(&iocb->aio_lio_opcode);
2022 				if (command == LIO_NOP) {
2023 					found++;
2024 					continue;
2025 				}
2026 
2027 				jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2028 
2029 				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2030 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2031 					    == jobref) {
2032 						if (cb->uaiocb.aio_lio_opcode
2033 						    == LIO_WRITE) {
2034 							p->p_stats->p_ru.ru_oublock
2035 							    +=
2036 							    cb->outputcharge;
2037 							cb->outputcharge = 0;
2038 						} else if (cb->uaiocb.aio_lio_opcode
2039 						    == LIO_READ) {
2040 							p->p_stats->p_ru.ru_inblock
2041 							    += cb->inputcharge;
2042 							cb->inputcharge = 0;
2043 						}
2044 						found++;
2045 						break;
2046 					}
2047 				}
2048 
2049 				s = splbio();
2050 				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2051 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2052 					    == jobref) {
2053 						found++;
2054 						break;
2055 					}
2056 				}
2057 				splx(s);
2058 			}
2059 
2060 			/*
2061 			 * If all I/Os have been disposed of, then we can
2062 			 * return.
2063 			 */
2064 			if (found == nentqueued)
2065 				return runningcode;
2066 
2067 			ki->kaio_flags |= KAIO_WAKEUP;
2068 			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2069 
2070 			if (error == EINTR)
2071 				return EINTR;
2072 			else if (error == EWOULDBLOCK)
2073 				return EAGAIN;
2074 		}
2075 	}
2076 
2077 	return runningcode;
2078 }
2079 
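/*
 * Illustrative userland sketch (compiled out, not part of this file): a small
 * batch submitted through lio_listio(2) with LIO_WAIT, which exercises the
 * queueing and wait loop above.  The helper name and two-buffer layout are
 * hypothetical.
 */
#if 0
#include <sys/types.h>
#include <aio.h>
#include <string.h>

static int
write_two_blocks(int fd, char *b0, char *b1, size_t len)
{
	struct aiocb acb[2];
	struct aiocb *list[2];
	int i;

	memset(acb, 0, sizeof(acb));
	for (i = 0; i < 2; i++) {
		acb[i].aio_fildes = fd;
		acb[i].aio_buf = (i == 0) ? b0 : b1;
		acb[i].aio_nbytes = len;
		acb[i].aio_offset = (off_t)i * len;
		acb[i].aio_lio_opcode = LIO_WRITE;
		list[i] = &acb[i];
	}

	/*
	 * LIO_WAIT blocks until every queued request has finished.  A return
	 * of -1 with errno set to EIO means at least one request failed and
	 * each aiocb must be examined with aio_error()/aio_return().
	 */
	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
		return (-1);
	for (i = 0; i < 2; i++) {
		if (aio_error(&acb[i]) != 0)
			return (-1);
		(void)aio_return(&acb[i]);
	}
	return (0);
}
#endif
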
2080 /*
2081  * Deferred signal posting:  it is safe to post a signal from a timeout
2082  * routine, but *not* from an interrupt routine, hence this handler.
2083  */
2084 static void
2085 process_signal(void *aioj)
2086 {
2087 	struct aiocblist *aiocbe = aioj;
2088 	struct aio_liojob *lj = aiocbe->lio;
2089 	struct aiocb *cb = &aiocbe->uaiocb;
2090 
2091 	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2092 		(lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2093 		PROC_LOCK(lj->lioj_ki->kaio_p);
2094 		psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2095 		PROC_UNLOCK(lj->lioj_ki->kaio_p);
2096 		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2097 	}
2098 
2099 	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2100 		PROC_LOCK(aiocbe->userproc);
2101 		psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2102 		PROC_UNLOCK(aiocbe->userproc);
2103 	}
2104 }
2105 
2106 /*
2107  * Interrupt handler for physio; performs the necessary process wakeups and
2108  * schedules signal posting via process_signal().
2109  */
2110 static void
2111 aio_physwakeup(struct buf *bp)
2112 {
2113 	struct aiocblist *aiocbe;
2114 	struct proc *p;
2115 	struct kaioinfo *ki;
2116 	struct aio_liojob *lj;
2117 
2118 	wakeup(bp);
2119 
2120 	aiocbe = (struct aiocblist *)bp->b_spc;
2121 	if (aiocbe) {
2122 		p = bp->b_caller1;
2123 
2124 		aiocbe->jobstate = JOBST_JOBBFINISHED;
2125 		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2126 		aiocbe->uaiocb._aiocb_private.error = 0;
2127 		aiocbe->jobflags |= AIOCBLIST_DONE;
2128 
2129 		if (bp->b_ioflags & BIO_ERROR)
2130 			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2131 
2132 		lj = aiocbe->lio;
2133 		if (lj) {
2134 			lj->lioj_buffer_finished_count++;
2135 
2136 			/*
2137 			 * wakeup/signal if all of the interrupt jobs are done.
2138 			 */
2139 			if (lj->lioj_buffer_finished_count ==
2140 			    lj->lioj_buffer_count) {
2141 				/*
2142 				 * Post a signal if it is called for.
2143 				 */
2144 				if ((lj->lioj_flags &
2145 				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2146 				    LIOJ_SIGNAL) {
2147 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2148 					aiocbe->timeouthandle =
2149 						timeout(process_signal,
2150 							aiocbe, 0);
2151 				}
2152 			}
2153 		}
2154 
2155 		ki = p->p_aioinfo;
2156 		if (ki) {
2157 			ki->kaio_buffer_finished_count++;
2158 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2159 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2160 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2161 
2162 			KNOTE(&aiocbe->klist, 0);
2163 			/* Do the wakeup. */
2164 			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2165 				ki->kaio_flags &= ~KAIO_WAKEUP;
2166 				wakeup(p);
2167 			}
2168 		}
2169 
2170 		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2171 			aiocbe->timeouthandle =
2172 				timeout(process_signal, aiocbe, 0);
2173 	}
2174 }
2175 
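/*
 * Illustrative userland sketch (compiled out, not part of this file): arming
 * SIGEV_SIGNAL notification, which the process_signal() / aio_physwakeup()
 * paths above ultimately deliver.  SIGUSR1 and the helper name are arbitrary,
 * hypothetical choices.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>

static volatile sig_atomic_t aio_done;

static void
on_aio_signal(int signo)
{

	aio_done = 1;
}

static int
read_with_signal(int fd, void *buf, size_t len)
{
	struct aiocb acb;
	struct sigaction sa;
	sigset_t mask, omask;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_aio_signal;
	if (sigaction(SIGUSR1, &sa, NULL) == -1)
		return (-1);

	/* Block SIGUSR1 so completion cannot race the wait loop below. */
	sigemptyset(&mask);
	sigaddset(&mask, SIGUSR1);
	sigprocmask(SIG_BLOCK, &mask, &omask);

	memset(&acb, 0, sizeof(acb));
	acb.aio_fildes = fd;
	acb.aio_buf = buf;
	acb.aio_nbytes = len;
	acb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
	acb.aio_sigevent.sigev_signo = SIGUSR1;

	aio_done = 0;
	if (aio_read(&acb) == -1) {
		sigprocmask(SIG_SETMASK, &omask, NULL);
		return (-1);
	}
	while (!aio_done)
		sigsuspend(&omask);	/* wait with SIGUSR1 unblocked */
	sigprocmask(SIG_SETMASK, &omask, NULL);
	return (aio_error(&acb));
}
#endif
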
2176 /* syscall - wait for the next completion of an aio request */
2177 int
2178 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2179 {
2180 	struct proc *p = td->td_proc;
2181 	struct timeval atv;
2182 	struct timespec ts;
2183 	struct kaioinfo *ki;
2184 	struct aiocblist *cb = NULL;
2185 	int error, s, timo;
2186 
2187 	suword(uap->aiocbp, (long)NULL);
2188 
2189 	timo = 0;
2190 	if (uap->timeout) {
2191 		/* Get timespec struct. */
2192 		error = copyin(uap->timeout, &ts, sizeof(ts));
2193 		if (error)
2194 			return error;
2195 
2196 		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2197 			return (EINVAL);
2198 
2199 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
2200 		if (itimerfix(&atv))
2201 			return (EINVAL);
2202 		timo = tvtohz(&atv);
2203 	}
2204 
2205 	ki = p->p_aioinfo;
2206 	if (ki == NULL)
2207 		return EAGAIN;
2208 
2209 	for (;;) {
2210 		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != NULL) {
2211 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2212 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2213 			error = cb->uaiocb._aiocb_private.error;
2214 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2215 				p->p_stats->p_ru.ru_oublock += cb->outputcharge;
2216 				cb->outputcharge = 0;
2217 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2218 				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2219 				cb->inputcharge = 0;
2220 			}
2221 			aio_free_entry(cb);	/* "cb" is gone after this */
2222 			return error;
2223 		}
2224 
2225 		s = splbio();
2226 		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != NULL) {
2227 			splx(s);
2228 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2229 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2230 			error = cb->uaiocb._aiocb_private.error;
2231 			aio_free_entry(cb);	/* "cb" is gone after this */
2232 			return error;
2233 		}
2234 
2235 		ki->kaio_flags |= KAIO_WAKEUP;
2236 		error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2237 		splx(s);
2238 
2239 		/* tsleep() returns 0, EINTR, ERESTART, or EWOULDBLOCK. */
2240 		if (error == ERESTART || error == EINTR)
2241 			return EINTR;
2242 		else if (error == EWOULDBLOCK)
2243 			return EAGAIN;
2246 	}
2247 }
2248 
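/*
 * Illustrative userland sketch (compiled out, not part of this file): draining
 * completions with the FreeBSD-specific aio_waitcomplete(2) implemented above.
 * The helper name, ten second timeout, and pending-request count are
 * hypothetical, and error handling is simplified.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <time.h>

static void
drain_completions(int npending)
{
	struct aiocb *acb;
	struct timespec ts;
	int n;

	ts.tv_sec = 10;
	ts.tv_nsec = 0;
	while (npending > 0) {
		/*
		 * Sleeps until any queued request finishes, stores a pointer
		 * to its aiocb, and returns that request's completion status
		 * (as aio_return() would have).
		 */
		n = aio_waitcomplete(&acb, &ts);
		if (n == -1) {
			if (errno == EAGAIN)	/* timed out */
				continue;
			perror("aio_waitcomplete");
			break;
		}
		printf("%p completed, status %d\n", (void *)acb, n);
		npending--;
	}
}
#endif
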
2249 /* kqueue attach function */
2250 static int
2251 filt_aioattach(struct knote *kn)
2252 {
2253 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2254 
2255 	/*
2256 	 * The aiocbe pointer must be validated before using it, so
2257 	 * registration is restricted to the kernel; the user cannot
2258 	 * set EV_FLAG1.
2259 	 */
2260 	if ((kn->kn_flags & EV_FLAG1) == 0)
2261 		return (EPERM);
2262 	kn->kn_flags &= ~EV_FLAG1;
2263 
2264 	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2265 
2266 	return (0);
2267 }
2268 
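/*
 * Illustrative userland sketch (compiled out, not part of this file): since
 * EVFILT_AIO knotes can only be attached from the kernel (the EV_FLAG1 check
 * above), an application asks for kqueue notification through the sigevent in
 * its aiocb and collects the completion with kevent(2).  The SIGEV_KEVENT /
 * sigev_notify_kqueue usage is assumed from the registration path elsewhere
 * in this file, and error handling is trimmed for brevity.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <aio.h>
#include <string.h>
#include <unistd.h>

static ssize_t
read_with_kqueue(int fd, void *buf, size_t len)
{
	struct aiocb acb;
	struct kevent ev;
	ssize_t ret;
	int kq;

	if ((kq = kqueue()) == -1)
		return (-1);

	memset(&acb, 0, sizeof(acb));
	acb.aio_fildes = fd;
	acb.aio_buf = buf;
	acb.aio_nbytes = len;
	acb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
	acb.aio_sigevent.sigev_notify_kqueue = kq;

	ret = -1;
	if (aio_read(&acb) == 0 &&
	    kevent(kq, NULL, 0, &ev, 1, NULL) == 1) {
		/* EVFILT_AIO identifies the request by its aiocb address. */
		ret = aio_return((struct aiocb *)ev.ident);
	}
	close(kq);
	return (ret);
}
#endif
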
2269 /* kqueue detach function */
2270 static void
2271 filt_aiodetach(struct knote *kn)
2272 {
2273 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2274 
2275 	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2276 }
2277 
2278 /* kqueue filter function */
2279 /*ARGSUSED*/
2280 static int
2281 filt_aio(struct knote *kn, long hint)
2282 {
2283 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2284 
2285 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2286 	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2287 	    aiocbe->jobstate != JOBST_JOBBFINISHED)
2288 		return (0);
2289 	kn->kn_flags |= EV_EOF;
2290 	return (1);
2291 }
2292