xref: /freebsd/sys/kern/vfs_aio.c (revision adeb92a24c57f97d5cd3c3c45be239cbb23aed68)
1 /*
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  *
16  * $FreeBSD$
17  */
18 
19 /*
20  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21  */
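
/*
 * Illustrative userland usage (not part of this file): a minimal sketch of
 * the POSIX AIO interface implemented here, assuming standard aio(4)
 * semantics.  aio_read() queues the request, aio_error() polls its status,
 * and aio_return() reaps the result and releases the kernel resources.
 * The path is hypothetical and error handling is abbreviated.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	char buf[512];
 *	struct aiocb cb;
 *	ssize_t n;
 *	int fd = open("/tmp/data", O_RDONLY);
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	aio_read(&cb);
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;
 *	n = aio_return(&cb);
 */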
22 
23 #include <sys/param.h>
24 #include <sys/systm.h>
25 #include <sys/bio.h>
26 #include <sys/buf.h>
27 #include <sys/sysproto.h>
28 #include <sys/filedesc.h>
29 #include <sys/kernel.h>
30 #include <sys/kthread.h>
31 #include <sys/fcntl.h>
32 #include <sys/file.h>
33 #include <sys/lock.h>
34 #include <sys/mutex.h>
35 #include <sys/unistd.h>
36 #include <sys/proc.h>
37 #include <sys/resourcevar.h>
38 #include <sys/signalvar.h>
39 #include <sys/protosw.h>
40 #include <sys/socketvar.h>
41 #include <sys/syscall.h>
42 #include <sys/sysent.h>
43 #include <sys/sysctl.h>
44 #include <sys/vnode.h>
45 #include <sys/conf.h>
46 #include <sys/event.h>
47 
48 #include <vm/vm.h>
49 #include <vm/vm_extern.h>
50 #include <vm/pmap.h>
51 #include <vm/vm_map.h>
52 #include <vm/vm_zone.h>
53 #include <sys/aio.h>
54 
55 #include <machine/limits.h>
56 
57 #include "opt_vfs_aio.h"
58 
59 static	long jobrefid;
60 
61 #define JOBST_NULL		0x0
62 #define	JOBST_JOBQPROC		0x1
63 #define JOBST_JOBQGLOBAL	0x2
64 #define JOBST_JOBRUNNING	0x3
65 #define JOBST_JOBFINISHED	0x4
66 #define	JOBST_JOBQBUF		0x5
67 #define	JOBST_JOBBFINISHED	0x6
68 
69 #ifndef MAX_AIO_PER_PROC
70 #define MAX_AIO_PER_PROC	32
71 #endif
72 
73 #ifndef MAX_AIO_QUEUE_PER_PROC
74 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
75 #endif
76 
77 #ifndef MAX_AIO_PROCS
78 #define MAX_AIO_PROCS		32
79 #endif
80 
81 #ifndef MAX_AIO_QUEUE
82 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
83 #endif
84 
85 #ifndef TARGET_AIO_PROCS
86 #define TARGET_AIO_PROCS	4
87 #endif
88 
89 #ifndef MAX_BUF_AIO
90 #define MAX_BUF_AIO		16
91 #endif
92 
93 #ifndef AIOD_TIMEOUT_DEFAULT
94 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
95 #endif
96 
97 #ifndef AIOD_LIFETIME_DEFAULT
98 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
99 #endif
100 
101 static int max_aio_procs = MAX_AIO_PROCS;
102 static int num_aio_procs = 0;
103 static int target_aio_procs = TARGET_AIO_PROCS;
104 static int max_queue_count = MAX_AIO_QUEUE;
105 static int num_queue_count = 0;
106 static int num_buf_aio = 0;
107 static int num_aio_resv_start = 0;
108 static int aiod_timeout;
109 static int aiod_lifetime;
110 static int unloadable = 0;
111 
112 static int max_aio_per_proc = MAX_AIO_PER_PROC;
113 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
114 static int max_buf_aio = MAX_BUF_AIO;
115 
116 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
117 
118 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
119 	CTLFLAG_RW, &max_aio_per_proc, 0, "");
120 
121 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
122 	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
123 
124 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
125 	CTLFLAG_RW, &max_aio_procs, 0, "");
126 
127 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
128 	CTLFLAG_RD, &num_aio_procs, 0, "");
129 
130 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
131 	CTLFLAG_RD, &num_queue_count, 0, "");
132 
133 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
134 	CTLFLAG_RW, &max_queue_count, 0, "");
135 
136 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
137 	CTLFLAG_RW, &target_aio_procs, 0, "");
138 
139 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
140 	CTLFLAG_RW, &max_buf_aio, 0, "");
141 
142 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
143 	CTLFLAG_RD, &num_buf_aio, 0, "");
144 
145 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
146 	CTLFLAG_RW, &aiod_lifetime, 0, "");
147 
148 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
149 	CTLFLAG_RW, &aiod_timeout, 0, "");
150 
151 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
152     "Allow unload of aio (not recommended)");
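
/*
 * Illustrative userland tuning sketch (not part of this file): the knobs
 * above appear under the vfs.aio sysctl tree and can be read or adjusted
 * with sysctlbyname(3).  The value written below is an example only.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *
 *	int val;
 *	size_t len = sizeof(val);
 *
 *	sysctlbyname("vfs.aio.max_aio_per_proc", &val, &len, NULL, 0);
 *	val = 64;
 *	sysctlbyname("vfs.aio.max_aio_per_proc", NULL, NULL, &val, sizeof(val));
 */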
153 
154 /*
155  * AIO process info
156  */
157 #define AIOP_FREE	0x1			/* proc on free queue */
158 #define AIOP_SCHED	0x2			/* proc explicitly scheduled */
159 
160 struct aiothreadlist {
161 	int aiothreadflags;			/* AIO proc flags */
162 	TAILQ_ENTRY(aiothreadlist) list;	/* List of processes */
163 	struct thread *aiothread;		/* The AIO thread */
164 	TAILQ_HEAD(,aiocblist) jobtorun;	/* suggested job to run */
165 };
166 
167 /*
168  * data-structure for lio signal management
169  */
170 struct aio_liojob {
171 	int	lioj_flags;
172 	int	lioj_buffer_count;
173 	int	lioj_buffer_finished_count;
174 	int	lioj_queue_count;
175 	int	lioj_queue_finished_count;
176 	struct	sigevent lioj_signal;	/* signal on all I/O done */
177 	TAILQ_ENTRY(aio_liojob) lioj_list;
178 	struct	kaioinfo *lioj_ki;
179 };
180 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
181 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
182 
183 /*
184  * per process aio data structure
185  */
186 struct kaioinfo {
187 	int	kaio_flags;		/* per process kaio flags */
188 	int	kaio_maxactive_count;	/* maximum number of AIOs */
189 	int	kaio_active_count;	/* number of currently used AIOs */
190 	int	kaio_qallowed_count;	/* maximum size of AIO queue */
191 	int	kaio_queue_count;	/* size of AIO queue */
192 	int	kaio_ballowed_count;	/* maximum number of buffers */
193 	int	kaio_queue_finished_count; /* number of daemon jobs finished */
194 	int	kaio_buffer_count;	/* number of physio buffers */
195 	int	kaio_buffer_finished_count; /* count of I/O done */
196 	struct 	proc *kaio_p;		/* process that uses this kaio block */
197 	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
198 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
199 	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
200 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
201 	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
202 	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
203 };
204 
205 #define KAIO_RUNDOWN	0x1	/* process is being run down */
206 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
207 
208 static TAILQ_HEAD(,aiothreadlist) aio_freeproc, aio_activeproc;
209 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
210 static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
211 
212 static void	aio_init_aioinfo(struct proc *p);
213 static void	aio_onceonly(void);
214 static int	aio_free_entry(struct aiocblist *aiocbe);
215 static void	aio_process(struct aiocblist *aiocbe);
216 static int	aio_newproc(void);
217 static int	aio_aqueue(struct thread *td, struct aiocb *job, int type);
218 static void	aio_physwakeup(struct buf *bp);
219 static void	aio_proc_rundown(struct proc *p);
220 static int	aio_fphysio(struct proc *p, struct aiocblist *aiocbe);
221 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
222 static void	aio_daemon(void *uproc);
223 static int	aio_unload(void);
224 static void	process_signal(void *aioj);
225 static int	filt_aioattach(struct knote *kn);
226 static void	filt_aiodetach(struct knote *kn);
227 static int	filt_aio(struct knote *kn, long hint);
228 
229 static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone;
230 static vm_zone_t aiolio_zone;
231 
232 static struct filterops aio_filtops =
233 	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
234 
235 static int
236 aio_modload(struct module *module, int cmd, void *arg)
237 {
238 	int error = 0;
239 
240 	switch (cmd) {
241 	case MOD_LOAD:
242 		aio_onceonly();
243 		break;
244 	case MOD_UNLOAD:
245 		error = aio_unload();
246 		break;
247 	case MOD_SHUTDOWN:
248 		break;
249 	default:
250 		error = EINVAL;
251 		break;
252 	}
253 	return (error);
254 }
255 
256 static moduledata_t aio_mod = {
257 	"aio",
258 	&aio_modload,
259 	NULL
260 };
261 
262 SYSCALL_MODULE_HELPER(aio_return);
263 SYSCALL_MODULE_HELPER(aio_suspend);
264 SYSCALL_MODULE_HELPER(aio_cancel);
265 SYSCALL_MODULE_HELPER(aio_error);
266 SYSCALL_MODULE_HELPER(aio_read);
267 SYSCALL_MODULE_HELPER(aio_write);
268 SYSCALL_MODULE_HELPER(aio_waitcomplete);
269 SYSCALL_MODULE_HELPER(lio_listio);
270 
271 DECLARE_MODULE(aio, aio_mod,
272 	SI_SUB_VFS, SI_ORDER_ANY);
273 MODULE_VERSION(aio, 1);
274 
275 /*
276  * Startup initialization
277  */
278 static void
279 aio_onceonly(void)
280 {
281 
282 	/* XXX: should probably just use so->callback */
283 	aio_swake = &aio_swake_cb;
284 	at_exit(aio_proc_rundown);
285 	at_exec(aio_proc_rundown);
286 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
287 	TAILQ_INIT(&aio_freeproc);
288 	TAILQ_INIT(&aio_activeproc);
289 	TAILQ_INIT(&aio_jobs);
290 	TAILQ_INIT(&aio_bufjobs);
291 	kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
292 	aiop_zone = zinit("AIOP", sizeof(struct aiothreadlist), 0, 0, 1);
293 	aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
294 	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof(int), 0, 0, 1);
295 	aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof(struct
296 	    aio_liojob), 0, 0, 1);
297 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
298 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
299 	jobrefid = 1;
300 }
301 
302 static int
303 aio_unload(void)
304 {
305 
306 	/*
307 	 * XXX: no unloads by default, it's too dangerous.
308 	 * perhaps we could do it if locked out callers and then
309 	 * did an aio_proc_rundown() on each process.
310 	 */
311 	if (!unloadable)
312 		return (EOPNOTSUPP);
313 
314 	aio_swake = NULL;
315 	rm_at_exit(aio_proc_rundown);
316 	rm_at_exec(aio_proc_rundown);
317 	kqueue_del_filteropts(EVFILT_AIO);
318 	return (0);
319 }
320 
321 /*
322  * Init the per-process aioinfo structure.  The aioinfo limits are set
323  * per-process for user limit (resource) management.
324  */
325 static void
326 aio_init_aioinfo(struct proc *p)
327 {
328 	struct kaioinfo *ki;
329 	if (p->p_aioinfo == NULL) {
330 		ki = zalloc(kaio_zone);
331 		p->p_aioinfo = ki;
332 		ki->kaio_flags = 0;
333 		ki->kaio_maxactive_count = max_aio_per_proc;
334 		ki->kaio_active_count = 0;
335 		ki->kaio_qallowed_count = max_aio_queue_per_proc;
336 		ki->kaio_queue_count = 0;
337 		ki->kaio_ballowed_count = max_buf_aio;
338 		ki->kaio_buffer_count = 0;
339 		ki->kaio_buffer_finished_count = 0;
340 		ki->kaio_p = p;
341 		TAILQ_INIT(&ki->kaio_jobdone);
342 		TAILQ_INIT(&ki->kaio_jobqueue);
343 		TAILQ_INIT(&ki->kaio_bufdone);
344 		TAILQ_INIT(&ki->kaio_bufqueue);
345 		TAILQ_INIT(&ki->kaio_liojoblist);
346 		TAILQ_INIT(&ki->kaio_sockqueue);
347 	}
348 
349 	while (num_aio_procs < target_aio_procs)
350 		aio_newproc();
351 }
352 
353 /*
354  * Free a job entry.  Wait for completion if it is currently active, but don't
355  * delay forever.  If we delay, we return a flag that says that we have to
356  * restart the queue scan.
357  */
358 static int
359 aio_free_entry(struct aiocblist *aiocbe)
360 {
361 	struct kaioinfo *ki;
362 	struct aiothreadlist *aiop;
363 	struct aio_liojob *lj;
364 	struct proc *p;
365 	int error;
366 	int s;
367 
368 	if (aiocbe->jobstate == JOBST_NULL)
369 		panic("aio_free_entry: freeing already free job");
370 
371 	p = aiocbe->userproc;
372 	ki = p->p_aioinfo;
373 	lj = aiocbe->lio;
374 	if (ki == NULL)
375 		panic("aio_free_entry: missing p->p_aioinfo");
376 
377 	while (aiocbe->jobstate == JOBST_JOBRUNNING) {
378 		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
379 			return 0;
380 		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
381 		tsleep(aiocbe, PRIBIO, "jobwai", 0);
382 	}
383 	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
384 
385 	if (aiocbe->bp == NULL) {
386 		if (ki->kaio_queue_count <= 0)
387 			panic("aio_free_entry: process queue size <= 0");
388 		if (num_queue_count <= 0)
389 			panic("aio_free_entry: system wide queue size <= 0");
390 
391 		if (lj) {
392 			lj->lioj_queue_count--;
393 			if (aiocbe->jobflags & AIOCBLIST_DONE)
394 				lj->lioj_queue_finished_count--;
395 		}
396 		ki->kaio_queue_count--;
397 		if (aiocbe->jobflags & AIOCBLIST_DONE)
398 			ki->kaio_queue_finished_count--;
399 		num_queue_count--;
400 	} else {
401 		if (lj) {
402 			lj->lioj_buffer_count--;
403 			if (aiocbe->jobflags & AIOCBLIST_DONE)
404 				lj->lioj_buffer_finished_count--;
405 		}
406 		if (aiocbe->jobflags & AIOCBLIST_DONE)
407 			ki->kaio_buffer_finished_count--;
408 		ki->kaio_buffer_count--;
409 		num_buf_aio--;
410 	}
411 
412 	/* aiocbe is going away, we need to destroy any knotes */
413 	knote_remove(&p->p_thread, &aiocbe->klist); /* XXXKSE */
414 	/* XXXKSE Note the thread here is used to eventually find the
415 	 * owning process again, but it is also used to do a fo_close
416 	 * and that requires the thread.  (But does it require the
417 	 * OWNING thread, or maybe the running thread?)
418 	 * There is a semantic problem here...
419 	 */
420 
421 	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
422 	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
423 		ki->kaio_flags &= ~KAIO_WAKEUP;
424 		wakeup(p);
425 	}
426 
427 	if (aiocbe->jobstate == JOBST_JOBQBUF) {
428 		if ((error = aio_fphysio(p, aiocbe)) != 0)
429 			return error;
430 		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
431 			panic("aio_free_entry: invalid physio finish-up state");
432 		s = splbio();
433 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
434 		splx(s);
435 	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
436 		aiop = aiocbe->jobaiothread;
437 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
438 	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
439 		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
440 		TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
441 	} else if (aiocbe->jobstate == JOBST_JOBFINISHED)
442 		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
443 	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
444 		s = splbio();
445 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
446 		splx(s);
447 		if (aiocbe->bp) {
448 			vunmapbuf(aiocbe->bp);
449 			relpbuf(aiocbe->bp, NULL);
450 			aiocbe->bp = NULL;
451 		}
452 	}
453 	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
454 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
455 		zfree(aiolio_zone, lj);
456 	}
457 	aiocbe->jobstate = JOBST_NULL;
458 	untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
459 	zfree(aiocb_zone, aiocbe);
460 	return 0;
461 }
462 
463 /*
464  * Rundown the jobs for a given process.
465  */
466 static void
467 aio_proc_rundown(struct proc *p)
468 {
469 	int s;
470 	struct kaioinfo *ki;
471 	struct aio_liojob *lj, *ljn;
472 	struct aiocblist *aiocbe, *aiocbn;
473 	struct file *fp;
474 	struct filedesc *fdp;
475 	struct socket *so;
476 
477 	ki = p->p_aioinfo;
478 	if (ki == NULL)
479 		return;
480 
481 	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
482 	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
483 	    ki->kaio_buffer_finished_count)) {
484 		ki->kaio_flags |= KAIO_RUNDOWN;
485 		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
486 			break;
487 	}
488 
489 	/*
490 	 * Move any aio ops that are waiting on socket I/O to the normal job
491 	 * queues so they are cleaned up with any others.
492 	 */
493 	fdp = p->p_fd;
494 
495 	s = splnet();
496 	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
497 	    aiocbn) {
498 		aiocbn = TAILQ_NEXT(aiocbe, plist);
499 		fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];
500 
501 		/*
502 		 * Under some circumstances, the aio_fildes and the file
503 		 * structure don't match.  This would leave aiocbe's in the
504 		 * TAILQ associated with the socket and cause a panic later.
505 		 *
506 		 * Detect and fix.
507 		 */
508 		if ((fp == NULL) || (fp != aiocbe->fd_file))
509 			fp = aiocbe->fd_file;
510 		if (fp) {
511 			so = (struct socket *)fp->f_data;
512 			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
513 			if (TAILQ_EMPTY(&so->so_aiojobq)) {
514 				so->so_snd.sb_flags &= ~SB_AIO;
515 				so->so_rcv.sb_flags &= ~SB_AIO;
516 			}
517 		}
518 		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
519 		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
520 		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
521 	}
522 	splx(s);
523 
524 restart1:
525 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
526 		aiocbn = TAILQ_NEXT(aiocbe, plist);
527 		if (aio_free_entry(aiocbe))
528 			goto restart1;
529 	}
530 
531 restart2:
532 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
533 	    aiocbn) {
534 		aiocbn = TAILQ_NEXT(aiocbe, plist);
535 		if (aio_free_entry(aiocbe))
536 			goto restart2;
537 	}
538 
539 /*
540  * Note the use of lots of splbio here, trying to avoid holding splbio across
541  * long chains of I/O.  Probably unnecessary.
542  */
543 restart3:
544 	s = splbio();
545 	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
546 		ki->kaio_flags |= KAIO_WAKEUP;
547 		tsleep(p, PRIBIO, "aioprn", 0);
548 		splx(s);
549 		goto restart3;
550 	}
551 	splx(s);
552 
553 restart4:
554 	s = splbio();
555 	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
556 		aiocbn = TAILQ_NEXT(aiocbe, plist);
557 		if (aio_free_entry(aiocbe)) {
558 			splx(s);
559 			goto restart4;
560 		}
561 	}
562 	splx(s);
563 
564 	/*
565 	 * If we've slept, jobs might have moved from one queue to another.
566 	 * Retry rundown if we didn't manage to empty the queues.
567 	 */
568 	if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
569 	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
570 	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
571 	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
572 		goto restart1;
573 
574 	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
575 		ljn = TAILQ_NEXT(lj, lioj_list);
576 		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
577 		    0)) {
578 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
579 			zfree(aiolio_zone, lj);
580 		} else {
581 #ifdef DIAGNOSTIC
582 			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
583 			    "QF:%d\n", lj->lioj_buffer_count,
584 			    lj->lioj_buffer_finished_count,
585 			    lj->lioj_queue_count,
586 			    lj->lioj_queue_finished_count);
587 #endif
588 		}
589 	}
590 
591 	zfree(kaio_zone, ki);
592 	p->p_aioinfo = NULL;
593 }
594 
595 /*
596  * Select a job to run (called by an AIO daemon).
597  */
598 static struct aiocblist *
599 aio_selectjob(struct aiothreadlist *aiop)
600 {
601 	int s;
602 	struct aiocblist *aiocbe;
603 	struct kaioinfo *ki;
604 	struct proc *userp;
605 
606 	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
607 	if (aiocbe) {
608 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
609 		return aiocbe;
610 	}
611 
612 	s = splnet();
613 	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
614 	    TAILQ_NEXT(aiocbe, list)) {
615 		userp = aiocbe->userproc;
616 		ki = userp->p_aioinfo;
617 
618 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
619 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
620 			splx(s);
621 			return aiocbe;
622 		}
623 	}
624 	splx(s);
625 
626 	return NULL;
627 }
628 
629 /*
630  * The AIO processing activity.  This is the code that does the I/O request for
631  * the non-physio version of the operations.  The normal vn operations are used,
632  * and this code should work in all instances for every type of file, including
633  * pipes, sockets, fifos, and regular files.
634  */
635 static void
636 aio_process(struct aiocblist *aiocbe)
637 {
638 	struct filedesc *fdp;
639 	struct thread *td;
640 	struct proc *userp;
641 	struct proc *mycp;
642 	struct aiocb *cb;
643 	struct file *fp;
644 	struct uio auio;
645 	struct iovec aiov;
646 	unsigned int fd;
647 	int cnt;
648 	int error;
649 	off_t offset;
650 	int oublock_st, oublock_end;
651 	int inblock_st, inblock_end;
652 
653 	userp = aiocbe->userproc;
654 	td = curthread;
655 	mycp = td->td_proc;
656 	cb = &aiocbe->uaiocb;
657 
658 	fdp = mycp->p_fd;
659 	fd = cb->aio_fildes;
660 	fp = fdp->fd_ofiles[fd];
661 
662 	if ((fp == NULL) || (fp != aiocbe->fd_file)) {
663 		cb->_aiocb_private.error = EBADF;
664 		cb->_aiocb_private.status = -1;
665 		return;
666 	}
667 
668 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
669 	aiov.iov_len = cb->aio_nbytes;
670 
671 	auio.uio_iov = &aiov;
672 	auio.uio_iovcnt = 1;
673 	auio.uio_offset = offset = cb->aio_offset;
674 	auio.uio_resid = cb->aio_nbytes;
675 	cnt = cb->aio_nbytes;
676 	auio.uio_segflg = UIO_USERSPACE;
677 	auio.uio_td = td;
678 
679 	inblock_st = mycp->p_stats->p_ru.ru_inblock;
680 	oublock_st = mycp->p_stats->p_ru.ru_oublock;
681 	/*
682 	 * Temporarily bump the ref count while reading to avoid the
683 	 * descriptor being ripped out from under us.
684 	 */
685 	fhold(fp);
686 	if (cb->aio_lio_opcode == LIO_READ) {
687 		auio.uio_rw = UIO_READ;
688 		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
689 	} else {
690 		auio.uio_rw = UIO_WRITE;
691 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
692 	}
693 	fdrop(fp, td);
694 	inblock_end = mycp->p_stats->p_ru.ru_inblock;
695 	oublock_end = mycp->p_stats->p_ru.ru_oublock;
696 
697 	aiocbe->inputcharge = inblock_end - inblock_st;
698 	aiocbe->outputcharge = oublock_end - oublock_st;
699 
700 	if ((error) && (auio.uio_resid != cnt)) {
701 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
702 			error = 0;
703 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
704 			PROC_LOCK(userp);
705 			psignal(userp, SIGPIPE);
706 			PROC_UNLOCK(userp);
707 		}
708 	}
709 
710 	cnt -= auio.uio_resid;
711 	cb->_aiocb_private.error = error;
712 	cb->_aiocb_private.status = cnt;
713 }
714 
715 /*
716  * The AIO daemon.  Most of the actual work is done in aio_process(),
717  * but the setup (and address space management) is done in this routine.
718  */
719 static void
720 aio_daemon(void *uproc)
721 {
722 	int s;
723 	struct aio_liojob *lj;
724 	struct aiocb *cb;
725 	struct aiocblist *aiocbe;
726 	struct aiothreadlist *aiop;
727 	struct kaioinfo *ki;
728 	struct proc *curcp, *mycp, *userp;
729 	struct vmspace *myvm, *tmpvm;
730 	struct thread *td = curthread;
731 
732 	mtx_lock(&Giant);
733 	/*
734 	 * Local copies of curproc (mycp) and vmspace (myvm)
735 	 */
736 	mycp = td->td_proc;
737 	myvm = mycp->p_vmspace;
738 
739 	if (mycp->p_textvp) {
740 		vrele(mycp->p_textvp);
741 		mycp->p_textvp = NULL;
742 	}
743 
744 	/*
745 	 * Allocate and ready the aio control info.  There is one aiop structure
746 	 * per daemon.
747 	 */
748 	aiop = zalloc(aiop_zone);
749 	aiop->aiothread = td;
750 	aiop->aiothreadflags |= AIOP_FREE;
751 	TAILQ_INIT(&aiop->jobtorun);
752 
753 	s = splnet();
754 
755 	/*
756 	 * Place thread (lightweight process) onto the AIO free thread list.
757 	 */
758 	if (TAILQ_EMPTY(&aio_freeproc))
759 		wakeup(&aio_freeproc);
760 	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
761 
762 	splx(s);
763 
764 	/*
765 	 * Get rid of our current file descriptors.  AIODs don't need any
766 	 * file descriptors, except as temporarily inherited from the client.
767 	 */
768 	fdfree(td);
769 	mycp->p_fd = NULL;
770 
771 	/* The daemon resides in its own pgrp. */
772 	enterpgrp(mycp, mycp->p_pid, 1);
773 
774 	/* Mark special process type. */
775 	mycp->p_flag |= P_SYSTEM;
776 
777 	/*
778 	 * Wake up the parent process.  (Parent sleeps to keep from blasting away
779 	 * and creating too many daemons.)
780 	 */
781 	wakeup(mycp);
782 
783 	for (;;) {
784 		/*
785 		 * curcp is the current daemon process context.
786 		 * userp is the current user process context.
787 		 */
788 		curcp = mycp;
789 
790 		/*
791 		 * Take daemon off of free queue
792 		 */
793 		if (aiop->aiothreadflags & AIOP_FREE) {
794 			s = splnet();
795 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
796 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
797 			aiop->aiothreadflags &= ~AIOP_FREE;
798 			splx(s);
799 		}
800 		aiop->aiothreadflags &= ~AIOP_SCHED;
801 
802 		/*
803 		 * Check for jobs.
804 		 */
805 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
806 			cb = &aiocbe->uaiocb;
807 			userp = aiocbe->userproc;
808 
809 			aiocbe->jobstate = JOBST_JOBRUNNING;
810 
811 			/*
812 			 * Connect to process address space for user program.
813 			 */
814 			if (userp != curcp) {
815 				/*
816 				 * Save the current address space that we are
817 				 * connected to.
818 				 */
819 				tmpvm = mycp->p_vmspace;
820 
821 				/*
822 				 * Point to the new user address space, and
823 				 * refer to it.
824 				 */
825 				mycp->p_vmspace = userp->p_vmspace;
826 				mycp->p_vmspace->vm_refcnt++;
827 
828 				/* Activate the new mapping. */
829 				pmap_activate(&mycp->p_thread);
830 
831 				/*
832 				 * If the old address space wasn't the daemon's
833 				 * own address space, then we need to remove the
834 				 * daemon's reference from the other process
835 				 * that it was acting on behalf of.
836 				 */
837 				if (tmpvm != myvm) {
838 					vmspace_free(tmpvm);
839 				}
840 
841 				/*
842 				 * Disassociate from the previous client's file
843 				 * descriptors, and associate with the new client's
844 				 * descriptors.  Note that the daemon doesn't
845 				 * need to worry about its original descriptors,
846 				 * because they were freed earlier.
847 				 */
848 				if (mycp->p_fd)
849 					fdfree(td);
850 				mycp->p_fd = fdshare(userp);
851 				curcp = userp;
852 			}
853 
854 			ki = userp->p_aioinfo;
855 			lj = aiocbe->lio;
856 
857 			/* Account for currently active jobs. */
858 			ki->kaio_active_count++;
859 
860 			/* Do the I/O function. */
861 			aiocbe->jobaiothread = aiop;
862 			aio_process(aiocbe);
863 
864 			/* Decrement the active job count. */
865 			ki->kaio_active_count--;
866 
867 			/*
868 			 * Increment the completion count for wakeup/signal
869 			 * comparisons.
870 			 */
871 			aiocbe->jobflags |= AIOCBLIST_DONE;
872 			ki->kaio_queue_finished_count++;
873 			if (lj)
874 				lj->lioj_queue_finished_count++;
875 			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
876 			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
877 				ki->kaio_flags &= ~KAIO_WAKEUP;
878 				wakeup(userp);
879 			}
880 
881 			s = splbio();
882 			if (lj && (lj->lioj_flags &
883 			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
884 				if ((lj->lioj_queue_finished_count ==
885 				    lj->lioj_queue_count) &&
886 				    (lj->lioj_buffer_finished_count ==
887 				    lj->lioj_buffer_count)) {
888 					PROC_LOCK(userp);
889 					psignal(userp,
890 					    lj->lioj_signal.sigev_signo);
891 					PROC_UNLOCK(userp);
892 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
893 				}
894 			}
895 			splx(s);
896 
897 			aiocbe->jobstate = JOBST_JOBFINISHED;
898 
899 			/*
900 			 * If the I/O request should be automatically rundown,
901 			 * do the needed cleanup.  Otherwise, place the queue
902 			 * entry for the just finished I/O request into the done
903 			 * queue for the associated client.
904 			 */
905 			s = splnet();
906 			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
907 				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
908 				zfree(aiocb_zone, aiocbe);
909 			} else {
910 				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
911 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
912 				    plist);
913 			}
914 			splx(s);
915 			KNOTE(&aiocbe->klist, 0);
916 
917 			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
918 				wakeup(aiocbe);
919 				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
920 			}
921 
922 			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
923 				PROC_LOCK(userp);
924 				psignal(userp, cb->aio_sigevent.sigev_signo);
925 				PROC_UNLOCK(userp);
926 			}
927 		}
928 
929 		/*
930 		 * Disconnect from user address space.
931 		 */
932 		if (curcp != mycp) {
933 			/* Get the user address space to disconnect from. */
934 			tmpvm = mycp->p_vmspace;
935 
936 			/* Get original address space for daemon. */
937 			mycp->p_vmspace = myvm;
938 
939 			/* Activate the daemon's address space. */
940 			pmap_activate(&mycp->p_thread);
941 #ifdef DIAGNOSTIC
942 			if (tmpvm == myvm) {
943 				printf("AIOD: vmspace problem -- %d\n",
944 				    mycp->p_pid);
945 			}
946 #endif
947 			/* Remove our vmspace reference. */
948 			vmspace_free(tmpvm);
949 
950 			/*
951 			 * Disassociate from the user process's file
952 			 * descriptors.
953 			 */
954 			if (mycp->p_fd)
955 				fdfree(td);
956 			mycp->p_fd = NULL;
957 			curcp = mycp;
958 		}
959 
960 		/*
961 		 * If we are the first to be put onto the free queue, wakeup
962 		 * anyone waiting for a daemon.
963 		 */
964 		s = splnet();
965 		TAILQ_REMOVE(&aio_activeproc, aiop, list);
966 		if (TAILQ_EMPTY(&aio_freeproc))
967 			wakeup(&aio_freeproc);
968 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
969 		aiop->aiothreadflags |= AIOP_FREE;
970 		splx(s);
971 
972 		/*
973 		 * If daemon is inactive for a long time, allow it to exit,
974 		 * thereby freeing resources.
975 		 */
976 		if (((aiop->aiothreadflags & AIOP_SCHED) == 0) && tsleep(mycp,
977 		    PRIBIO, "aiordy", aiod_lifetime)) {
978 			s = splnet();
979 			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
980 			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
981 				if ((aiop->aiothreadflags & AIOP_FREE) &&
982 				    (num_aio_procs > target_aio_procs)) {
983 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
984 					splx(s);
985 					zfree(aiop_zone, aiop);
986 					num_aio_procs--;
987 #ifdef DIAGNOSTIC
988 					if (mycp->p_vmspace->vm_refcnt <= 1) {
989 						printf("AIOD: bad vm refcnt for"
990 						    " exiting daemon: %d\n",
991 						    mycp->p_vmspace->vm_refcnt);
992 					}
993 #endif
994 					kthread_exit(0);
995 				}
996 			}
997 			splx(s);
998 		}
999 	}
1000 }
1001 
1002 /*
1003  * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
1004  * AIO daemon modifies its environment itself.
1005  */
1006 static int
1007 aio_newproc()
1008 {
1009 	int error;
1010 	struct proc *p;
1011 
1012 	error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d",
1013 			       num_aio_procs);
1014 	if (error)
1015 		return error;
1016 
1017 	/*
1018 	 * Wait until daemon is started, but continue on just in case to
1019 	 * handle error conditions.
1020 	 */
1021 	error = tsleep(p, PZERO, "aiosta", aiod_timeout);
1022 
1023 	num_aio_procs++;
1024 
1025 	return error;
1026 }
1027 
1028 /*
1029  * Try the high-performance, low-overhead physio method for eligible
1030  * VCHR devices.  This method doesn't use an aio helper thread, and
1031  * thus has very low overhead.
1032  *
1033  * Assumes that the caller, _aio_aqueue(), has incremented the file
1034  * structure's reference count, preventing its deallocation for the
1035  * duration of this call.
1036  */
1037 static int
1038 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1039 {
1040 	int error;
1041 	struct aiocb *cb;
1042 	struct file *fp;
1043 	struct buf *bp;
1044 	struct vnode *vp;
1045 	struct kaioinfo *ki;
1046 	struct filedesc *fdp;
1047 	struct aio_liojob *lj;
1048 	int fd;
1049 	int s;
1050 	int notify;
1051 
1052 	cb = &aiocbe->uaiocb;
1053 	fdp = p->p_fd;
1054 	fd = cb->aio_fildes;
1055 	fp = fdp->fd_ofiles[fd];
1056 
1057 	if (fp->f_type != DTYPE_VNODE)
1058 		return (-1);
1059 
1060 	vp = (struct vnode *)fp->f_data;
1061 
1062 	/*
1063 	 * If it's not a disk, we don't want to return a positive error.
1064 	 * That would keep the aio code from falling through to the
1065 	 * thread-based path when you're talking to a regular file.
1066 	 */
1067 	if (!vn_isdisk(vp, &error)) {
1068 		if (error == ENOTBLK)
1069 			return (-1);
1070 		else
1071 			return (error);
1072 	}
1073 
1074  	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
1075 		return (-1);
1076 
1077 	if (cb->aio_nbytes >
1078 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1079 		return (-1);
1080 
1081 	ki = p->p_aioinfo;
1082 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1083 		return (-1);
1084 
1085 	ki->kaio_buffer_count++;
1086 
1087 	lj = aiocbe->lio;
1088 	if (lj)
1089 		lj->lioj_buffer_count++;
1090 
1091 	/* Create and build a buffer header for a transfer. */
1092 	bp = (struct buf *)getpbuf(NULL);
1093 	BUF_KERNPROC(bp);
1094 
1095 	/*
1096 	 * Get a copy of the kva from the physical buffer.
1097 	 */
1098 	bp->b_caller1 = p;
1099 	bp->b_dev = vp->v_rdev;
1100 	error = bp->b_error = 0;
1101 
1102 	bp->b_bcount = cb->aio_nbytes;
1103 	bp->b_bufsize = cb->aio_nbytes;
1104 	bp->b_flags = B_PHYS;
1105 	bp->b_iodone = aio_physwakeup;
1106 	bp->b_saveaddr = bp->b_data;
1107 	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1108 	bp->b_blkno = btodb(cb->aio_offset);
1109 
1110 	if (cb->aio_lio_opcode == LIO_WRITE) {
1111 		bp->b_iocmd = BIO_WRITE;
1112 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1113 			error = EFAULT;
1114 			goto doerror;
1115 		}
1116 	} else {
1117 		bp->b_iocmd = BIO_READ;
1118 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1119 			error = EFAULT;
1120 			goto doerror;
1121 		}
1122 	}
1123 
1124 	/* Bring buffer into kernel space. */
1125 	vmapbuf(bp);
1126 
1127 	s = splbio();
1128 	aiocbe->bp = bp;
1129 	bp->b_spc = (void *)aiocbe;
1130 	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1131 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1132 	aiocbe->jobstate = JOBST_JOBQBUF;
1133 	cb->_aiocb_private.status = cb->aio_nbytes;
1134 	num_buf_aio++;
1135 	bp->b_error = 0;
1136 
1137 	splx(s);
1138 
1139 	/* Perform transfer. */
1140 	DEV_STRATEGY(bp, 0);
1141 
1142 	notify = 0;
1143 	s = splbio();
1144 
1145 	/*
1146 	 * If we had an error invoking the request, or an error in processing
1147 	 * the request before we have returned, we process it as an error in
1148 	 * transfer.  Note that such an I/O error is not indicated immediately,
1149 	 * but is returned using the aio_error mechanism.  In this case,
1150 	 * aio_suspend will return immediately.
1151 	 */
1152 	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1153 		struct aiocb *job = aiocbe->uuaiocb;
1154 
1155 		aiocbe->uaiocb._aiocb_private.status = 0;
1156 		suword(&job->_aiocb_private.status, 0);
1157 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1158 		suword(&job->_aiocb_private.error, bp->b_error);
1159 
1160 		ki->kaio_buffer_finished_count++;
1161 
1162 		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1163 			aiocbe->jobstate = JOBST_JOBBFINISHED;
1164 			aiocbe->jobflags |= AIOCBLIST_DONE;
1165 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1166 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1167 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1168 			notify = 1;
1169 		}
1170 	}
1171 	splx(s);
1172 	if (notify)
1173 		KNOTE(&aiocbe->klist, 0);
1174 	return 0;
1175 
1176 doerror:
1177 	ki->kaio_buffer_count--;
1178 	if (lj)
1179 		lj->lioj_buffer_count--;
1180 	aiocbe->bp = NULL;
1181 	relpbuf(bp, NULL);
1182 	return error;
1183 }
1184 
1185 /*
1186  * This waits/tests physio completion.
1187  */
1188 static int
1189 aio_fphysio(struct proc *p, struct aiocblist *iocb)
1190 {
1191 	int s;
1192 	struct buf *bp;
1193 	int error;
1194 
1195 	bp = iocb->bp;
1196 
1197 	s = splbio();
1198 	while ((bp->b_flags & B_DONE) == 0) {
1199 		if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
1200 			if ((bp->b_flags & B_DONE) == 0) {
1201 				splx(s);
1202 				return EINPROGRESS;
1203 			} else
1204 				break;
1205 		}
1206 	}
1207 	splx(s);
1208 
1209 	/* Release mapping into kernel space. */
1210 	vunmapbuf(bp);
1211 	iocb->bp = 0;
1212 
1213 	error = 0;
1214 
1215 	/* Check for an error. */
1216 	if (bp->b_ioflags & BIO_ERROR)
1217 		error = bp->b_error;
1218 
1219 	relpbuf(bp, NULL);
1220 	return (error);
1221 }
1222 
1223 /*
1224  * Wake up aio requests that may be serviceable now.
1225  */
1226 void
1227 aio_swake_cb(struct socket *so, struct sockbuf *sb)
1228 {
1229 	struct aiocblist *cb,*cbn;
1230 	struct proc *p;
1231 	struct kaioinfo *ki = NULL;
1232 	int opcode, wakecount = 0;
1233 	struct aiothreadlist *aiop;
1234 
1235 	if (sb == &so->so_snd) {
1236 		opcode = LIO_WRITE;
1237 		so->so_snd.sb_flags &= ~SB_AIO;
1238 	} else {
1239 		opcode = LIO_READ;
1240 		so->so_rcv.sb_flags &= ~SB_AIO;
1241 	}
1242 
1243 	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1244 		cbn = TAILQ_NEXT(cb, list);
1245 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1246 			p = cb->userproc;
1247 			ki = p->p_aioinfo;
1248 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1249 			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1250 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1251 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1252 			wakecount++;
1253 			if (cb->jobstate != JOBST_JOBQGLOBAL)
1254 				panic("invalid queue value");
1255 		}
1256 	}
1257 
1258 	while (wakecount--) {
1259 		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1260 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1261 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1262 			aiop->aiothreadflags &= ~AIOP_FREE;
1263 			wakeup(aiop->aiothread);
1264 		}
1265 	}
1266 }
1267 
1268 /*
1269  * Queue a new AIO request.  The choice between the threaded and the direct
1270  * physio (VCHR) technique is made in this code.
1271  */
1272 static int
1273 _aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
1274 {
1275 	struct proc *p = td->td_proc;
1276 	struct filedesc *fdp;
1277 	struct file *fp;
1278 	unsigned int fd;
1279 	struct socket *so;
1280 	int s;
1281 	int error;
1282 	int opcode;
1283 	struct aiocblist *aiocbe;
1284 	struct aiothreadlist *aiop;
1285 	struct kaioinfo *ki;
1286 	struct kevent kev;
1287 	struct kqueue *kq;
1288 	struct file *kq_fp;
1289 
1290 	aiocbe = zalloc(aiocb_zone);
1291 	aiocbe->inputcharge = 0;
1292 	aiocbe->outputcharge = 0;
1293 	callout_handle_init(&aiocbe->timeouthandle);
1294 	SLIST_INIT(&aiocbe->klist);
1295 
1296 	suword(&job->_aiocb_private.status, -1);
1297 	suword(&job->_aiocb_private.error, 0);
1298 	suword(&job->_aiocb_private.kernelinfo, -1);
1299 
1300 	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1301 	if (error) {
1302 		suword(&job->_aiocb_private.error, error);
1303 		zfree(aiocb_zone, aiocbe);
1304 		return error;
1305 	}
1306 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1307 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1308 		zfree(aiocb_zone, aiocbe);
1309 		return EINVAL;
1310 	}
1311 
1312 	/* Save userspace address of the job info. */
1313 	aiocbe->uuaiocb = job;
1314 
1315 	/* Get the opcode. */
1316 	if (type != LIO_NOP)
1317 		aiocbe->uaiocb.aio_lio_opcode = type;
1318 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1319 
1320 	/* Get the fd info for process. */
1321 	fdp = p->p_fd;
1322 
1323 	/*
1324 	 * Range check file descriptor.
1325 	 */
1326 	fd = aiocbe->uaiocb.aio_fildes;
1327 	if (fd >= fdp->fd_nfiles) {
1328 		zfree(aiocb_zone, aiocbe);
1329 		if (type == 0)
1330 			suword(&job->_aiocb_private.error, EBADF);
1331 		return EBADF;
1332 	}
1333 
1334 	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1335 	if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1336 	    0))) {
1337 		zfree(aiocb_zone, aiocbe);
1338 		if (type == 0)
1339 			suword(&job->_aiocb_private.error, EBADF);
1340 		return EBADF;
1341 	}
1342 
1343 	if (aiocbe->uaiocb.aio_offset == -1LL) {
1344 		zfree(aiocb_zone, aiocbe);
1345 		if (type == 0)
1346 			suword(&job->_aiocb_private.error, EINVAL);
1347 		return EINVAL;
1348 	}
1349 
1350 	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1351 	if (error) {
1352 		zfree(aiocb_zone, aiocbe);
1353 		if (type == 0)
1354 			suword(&job->_aiocb_private.error, EINVAL);
1355 		return error;
1356 	}
1357 
1358 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1359 	if (jobrefid == LONG_MAX)
1360 		jobrefid = 1;
1361 	else
1362 		jobrefid++;
1363 
1364 	if (opcode == LIO_NOP) {
1365 		zfree(aiocb_zone, aiocbe);
1366 		if (type == 0) {
1367 			suword(&job->_aiocb_private.error, 0);
1368 			suword(&job->_aiocb_private.status, 0);
1369 			suword(&job->_aiocb_private.kernelinfo, 0);
1370 		}
1371 		return 0;
1372 	}
1373 
1374 	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1375 		zfree(aiocb_zone, aiocbe);
1376 		if (type == 0) {
1377 			suword(&job->_aiocb_private.status, 0);
1378 			suword(&job->_aiocb_private.error, EINVAL);
1379 		}
1380 		return EINVAL;
1381 	}
1382 
1383 	fhold(fp);
1384 
1385 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1386 		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1387 		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1388 	}
1389 	else {
1390 		/*
1391 		 * This method for requesting kevent-based notification won't
1392 		 * work on the alpha, since we're passing in a pointer
1393 		 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
1394 		 * based method instead.
1395 		 */
1396 		struct kevent *kevp;
1397 
1398 		kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode;
1399 		if (kevp == NULL)
1400 			goto no_kqueue;
1401 
1402 		error = copyin(kevp, &kev, sizeof(kev));
1403 		if (error)
1404 			goto aqueue_fail;
1405 	}
1406 	if ((u_int)kev.ident >= fdp->fd_nfiles ||
1407 	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1408 	    (kq_fp->f_type != DTYPE_KQUEUE)) {
1409 		error = EBADF;
1410 		goto aqueue_fail;
1411 	}
1412 	kq = (struct kqueue *)kq_fp->f_data;
1413 	kev.ident = (uintptr_t)aiocbe;
1414 	kev.filter = EVFILT_AIO;
1415 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1416 	error = kqueue_register(kq, &kev, td);
1417 aqueue_fail:
1418 	if (error) {
1419 		zfree(aiocb_zone, aiocbe);
1420 		if (type == 0)
1421 			suword(&job->_aiocb_private.error, error);
1422 		goto done;
1423 	}
1424 no_kqueue:
1425 
1426 	suword(&job->_aiocb_private.error, EINPROGRESS);
1427 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1428 	aiocbe->userproc = p;
1429 	aiocbe->jobflags = 0;
1430 	aiocbe->lio = lj;
1431 	ki = p->p_aioinfo;
1432 
1433 	if (fp->f_type == DTYPE_SOCKET) {
1434 		/*
1435 		 * Alternate queueing for socket ops: Reach down into the
1436 		 * descriptor to get the socket data.  Then check to see if the
1437 		 * socket is ready to be read or written (based on the requested
1438 		 * operation).
1439 		 *
1440 		 * If it is not ready for I/O, then queue the aiocbe on the
1441 		 * socket, and set the flags so we get a call when sbnotify()
1442 		 * happens.
1443 		 */
1444 		so = (struct socket *)fp->f_data;
1445 		s = splnet();
1446 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1447 		    LIO_WRITE) && (!sowriteable(so)))) {
1448 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1449 			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1450 			if (opcode == LIO_READ)
1451 				so->so_rcv.sb_flags |= SB_AIO;
1452 			else
1453 				so->so_snd.sb_flags |= SB_AIO;
1454 			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1455 			ki->kaio_queue_count++;
1456 			num_queue_count++;
1457 			splx(s);
1458 			error = 0;
1459 			goto done;
1460 		}
1461 		splx(s);
1462 	}
1463 
1464 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1465 		goto done;
1466 	if (error > 0) {
1467 		suword(&job->_aiocb_private.status, 0);
1468 		aiocbe->uaiocb._aiocb_private.error = error;
1469 		suword(&job->_aiocb_private.error, error);
1470 		goto done;
1471 	}
1472 
1473 	/* No buffer for daemon I/O. */
1474 	aiocbe->bp = NULL;
1475 
1476 	ki->kaio_queue_count++;
1477 	if (lj)
1478 		lj->lioj_queue_count++;
1479 	s = splnet();
1480 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1481 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1482 	splx(s);
1483 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1484 
1485 	num_queue_count++;
1486 	error = 0;
1487 
1488 	/*
1489 	 * If we don't have a free AIO process, and we are below our quota, then
1490 	 * start one.  Otherwise, depend on the subsequent I/O completions to
1491 	 * pick up this job.  If we don't successfully create the new process
1492 	 * (thread) due to resource issues, we return an error for now (EAGAIN),
1493 	 * which is likely not the correct thing to do.
1494 	 */
1495 retryproc:
1496 	s = splnet();
1497 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1498 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1499 		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1500 		aiop->aiothreadflags &= ~AIOP_FREE;
1501 		wakeup(aiop->aiothread);
1502 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1503 	    ((ki->kaio_active_count + num_aio_resv_start) <
1504 	    ki->kaio_maxactive_count)) {
1505 		num_aio_resv_start++;
1506 		if ((error = aio_newproc()) == 0) {
1507 			num_aio_resv_start--;
1508 			td->td_retval[0] = 0;
1509 			goto retryproc;
1510 		}
1511 		num_aio_resv_start--;
1512 	}
1513 	splx(s);
1514 done:
1515 	fdrop(fp, td);
1516 	return error;
1517 }
1518 
1519 /*
1520  * This routine queues an AIO request, checking for quotas.
1521  */
1522 static int
1523 aio_aqueue(struct thread *td, struct aiocb *job, int type)
1524 {
1525 	struct proc *p = td->td_proc;
1526 	struct kaioinfo *ki;
1527 
1528 	if (p->p_aioinfo == NULL)
1529 		aio_init_aioinfo(p);
1530 
1531 	if (num_queue_count >= max_queue_count)
1532 		return EAGAIN;
1533 
1534 	ki = p->p_aioinfo;
1535 	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1536 		return EAGAIN;
1537 
1538 	return _aio_aqueue(td, job, NULL, type);
1539 }
1540 
1541 /*
1542  * Support the aio_return system call.  As a side effect, kernel resources
1543  * are released.
1544  */
1545 int
1546 aio_return(struct thread *td, struct aio_return_args *uap)
1547 {
1548 	struct proc *p = td->td_proc;
1549 	int s;
1550 	int jobref;
1551 	struct aiocblist *cb, *ncb;
1552 	struct aiocb *ujob;
1553 	struct kaioinfo *ki;
1554 
1555 	ki = p->p_aioinfo;
1556 	if (ki == NULL)
1557 		return EINVAL;
1558 
1559 	ujob = uap->aiocbp;
1560 
1561 	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1562 	if (jobref == -1 || jobref == 0)
1563 		return EINVAL;
1564 
1565 	s = splnet();
1566 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1567 	    plist)) {
1568 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1569 		    jobref) {
1570 			splx(s);
1571 			if (ujob == cb->uuaiocb) {
1572 				td->td_retval[0] =
1573 				    cb->uaiocb._aiocb_private.status;
1574 			} else
1575 				td->td_retval[0] = EFAULT;
1576 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1577 				p->p_stats->p_ru.ru_oublock +=
1578 				    cb->outputcharge;
1579 				cb->outputcharge = 0;
1580 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1581 				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1582 				cb->inputcharge = 0;
1583 			}
1584 			aio_free_entry(cb);
1585 			return 0;
1586 		}
1587 	}
1588 	splx(s);
1589 
1590 	s = splbio();
1591 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1592 		ncb = TAILQ_NEXT(cb, plist);
1593 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1594 		    == jobref) {
1595 			splx(s);
1596 			if (ujob == cb->uuaiocb) {
1597 				td->td_retval[0] =
1598 				    cb->uaiocb._aiocb_private.status;
1599 			} else
1600 				td->td_retval[0] = EFAULT;
1601 			aio_free_entry(cb);
1602 			return 0;
1603 		}
1604 	}
1605 	splx(s);
1606 
1607 	return (EINVAL);
1608 }
1609 
1610 /*
1611  * Allow a process to wake up when any of the I/O requests are completed.
1612  */
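/*
 * Illustrative userland usage (not part of this file): waiting for one of
 * several outstanding requests with a timeout, assuming standard aio(4)
 * semantics.  cb0 and cb1 are hypothetical aiocbs queued earlier with
 * aio_read()/aio_write().
 *
 *	const struct aiocb *list[2] = { &cb0, &cb1 };
 *	struct timespec ts = { 1, 0 };
 *	int rv;
 *
 *	rv = aio_suspend(list, 2, &ts);
 *
 * rv is 0 once at least one request has completed; -1 with errno set to
 * EAGAIN means the timeout expired, and EINTR means a signal interrupted
 * the wait.
 */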
1613 int
1614 aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1615 {
1616 	struct proc *p = td->td_proc;
1617 	struct timeval atv;
1618 	struct timespec ts;
1619 	struct aiocb *const *cbptr, *cbp;
1620 	struct kaioinfo *ki;
1621 	struct aiocblist *cb;
1622 	int i;
1623 	int njoblist;
1624 	int error, s, timo;
1625 	int *ijoblist;
1626 	struct aiocb **ujoblist;
1627 
1628 	if (uap->nent > AIO_LISTIO_MAX)
1629 		return EINVAL;
1630 
1631 	timo = 0;
1632 	if (uap->timeout) {
1633 		/* Get timespec struct. */
1634 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1635 			return error;
1636 
1637 		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1638 			return (EINVAL);
1639 
1640 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
1641 		if (itimerfix(&atv))
1642 			return (EINVAL);
1643 		timo = tvtohz(&atv);
1644 	}
1645 
1646 	ki = p->p_aioinfo;
1647 	if (ki == NULL)
1648 		return EAGAIN;
1649 
1650 	njoblist = 0;
1651 	ijoblist = zalloc(aiol_zone);
1652 	ujoblist = zalloc(aiol_zone);
1653 	cbptr = uap->aiocbp;
1654 
1655 	for (i = 0; i < uap->nent; i++) {
1656 		cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1657 		if (cbp == 0)
1658 			continue;
1659 		ujoblist[njoblist] = cbp;
1660 		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1661 		njoblist++;
1662 	}
1663 
1664 	if (njoblist == 0) {
1665 		zfree(aiol_zone, ijoblist);
1666 		zfree(aiol_zone, ujoblist);
1667 		return 0;
1668 	}
1669 
1670 	error = 0;
1671 	for (;;) {
1672 		for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb =
1673 		    TAILQ_NEXT(cb, plist)) {
1674 			for (i = 0; i < njoblist; i++) {
1675 				if (((intptr_t)
1676 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1677 				    ijoblist[i]) {
1678 					if (ujoblist[i] != cb->uuaiocb)
1679 						error = EINVAL;
1680 					zfree(aiol_zone, ijoblist);
1681 					zfree(aiol_zone, ujoblist);
1682 					return error;
1683 				}
1684 			}
1685 		}
1686 
1687 		s = splbio();
1688 		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1689 		    TAILQ_NEXT(cb, plist)) {
1690 			for (i = 0; i < njoblist; i++) {
1691 				if (((intptr_t)
1692 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1693 				    ijoblist[i]) {
1694 					splx(s);
1695 					if (ujoblist[i] != cb->uuaiocb)
1696 						error = EINVAL;
1697 					zfree(aiol_zone, ijoblist);
1698 					zfree(aiol_zone, ujoblist);
1699 					return error;
1700 				}
1701 			}
1702 		}
1703 
1704 		ki->kaio_flags |= KAIO_WAKEUP;
1705 		error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1706 		splx(s);
1707 
1708 		if (error == ERESTART || error == EINTR) {
1709 			zfree(aiol_zone, ijoblist);
1710 			zfree(aiol_zone, ujoblist);
1711 			return EINTR;
1712 		} else if (error == EWOULDBLOCK) {
1713 			zfree(aiol_zone, ijoblist);
1714 			zfree(aiol_zone, ujoblist);
1715 			return EAGAIN;
1716 		}
1717 	}
1718 
1719 /* NOTREACHED */
1720 	return EINVAL;
1721 }
1722 
1723 /*
1724  * aio_cancel cancels any non-physio aio operations not currently in
1725  * progress.
1726  */
1727 int
1728 aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1729 {
1730 	struct proc *p = td->td_proc;
1731 	struct kaioinfo *ki;
1732 	struct aiocblist *cbe, *cbn;
1733 	struct file *fp;
1734 	struct filedesc *fdp;
1735 	struct socket *so;
1736 	struct proc *po;
1737 	int s,error;
1738 	int cancelled=0;
1739 	int notcancelled=0;
1740 	struct vnode *vp;
1741 
1742 	fdp = p->p_fd;
1743 
1744 	fp = fdp->fd_ofiles[uap->fd];
1745 
1746 	if (fp == NULL) {
1747 		return EBADF;
1748 	}
1749 
1750 	if (fp->f_type == DTYPE_VNODE) {
1751 		vp = (struct vnode *)fp->f_data;
1752 
1753 		if (vn_isdisk(vp, &error)) {
1754 			td->td_retval[0] = AIO_NOTCANCELED;
1755 			return 0;
1756 		}
1757 	} else if (fp->f_type == DTYPE_SOCKET) {
1758 		so = (struct socket *)fp->f_data;
1759 
1760 		s = splnet();
1761 
1762 		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1763 			cbn = TAILQ_NEXT(cbe, list);
1764 			if ((uap->aiocbp == NULL) ||
1765 				(uap->aiocbp == cbe->uuaiocb) ) {
1766 				po = cbe->userproc;
1767 				ki = po->p_aioinfo;
1768 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1769 				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1770 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1771 				if (ki->kaio_flags & KAIO_WAKEUP) {
1772 					wakeup(po);
1773 				}
1774 				cbe->jobstate = JOBST_JOBFINISHED;
1775 				cbe->uaiocb._aiocb_private.status=-1;
1776 				cbe->uaiocb._aiocb_private.error=ECANCELED;
1777 				cancelled++;
1778 /* XXX cancelled, knote? */
1779 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1780 				    SIGEV_SIGNAL) {
1781 					PROC_LOCK(cbe->userproc);
1782 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1783 					PROC_UNLOCK(cbe->userproc);
1784 				}
1785 				if (uap->aiocbp)
1786 					break;
1787 			}
1788 		}
1789 
1790 		splx(s);
1791 
1792 		if ((cancelled) && (uap->aiocbp)) {
1793 			td->td_retval[0] = AIO_CANCELED;
1794 			return 0;
1795 		}
1796 
1797 	}
1798 
1799 	ki=p->p_aioinfo;
1800 
1801 	s = splnet();
1802 
1803 	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1804 		cbn = TAILQ_NEXT(cbe, plist);
1805 
1806 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1807 		    ((uap->aiocbp == NULL ) ||
1808 		     (uap->aiocbp == cbe->uuaiocb))) {
1809 
1810 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1811 				TAILQ_REMOVE(&aio_jobs, cbe, list);
1812 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1813 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1814 				    plist);
1815 				cancelled++;
1816 				ki->kaio_queue_finished_count++;
1817 				cbe->jobstate = JOBST_JOBFINISHED;
1818 				cbe->uaiocb._aiocb_private.status = -1;
1819 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1820 /* XXX cancelled, knote? */
1821 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1822 				    SIGEV_SIGNAL) {
1823 					PROC_LOCK(cbe->userproc);
1824 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1825 					PROC_UNLOCK(cbe->userproc);
1826 				}
1827 			} else {
1828 				notcancelled++;
1829 			}
1830 		}
1831 	}
1832 
1833 	splx(s);
1834 
1835 
1836 	if (notcancelled) {
1837 		td->td_retval[0] = AIO_NOTCANCELED;
1838 		return 0;
1839 	}
1840 
1841 	if (cancelled) {
1842 		td->td_retval[0] = AIO_CANCELED;
1843 		return 0;
1844 	}
1845 
1846 	td->td_retval[0] = AIO_ALLDONE;
1847 
1848 	return 0;
1849 }
1850 
1851 /*
1852  * aio_error is implemented at the kernel level for compatibility purposes only.
1853  * For a user mode async implementation, it would be best to do it in a userland
1854  * subroutine.
1855  */
1856 int
1857 aio_error(struct thread *td, struct aio_error_args *uap)
1858 {
1859 	struct proc *p = td->td_proc;
1860 	int s;
1861 	struct aiocblist *cb;
1862 	struct kaioinfo *ki;
1863 	int jobref;
1864 
1865 	ki = p->p_aioinfo;
1866 	if (ki == NULL)
1867 		return EINVAL;
1868 
1869 	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1870 	if ((jobref == -1) || (jobref == 0))
1871 		return EINVAL;
1872 
1873 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1874 	    plist)) {
1875 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1876 		    jobref) {
1877 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1878 			return 0;
1879 		}
1880 	}
1881 
1882 	s = splnet();
1883 
1884 	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1885 	    plist)) {
1886 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1887 		    jobref) {
1888 			td->td_retval[0] = EINPROGRESS;
1889 			splx(s);
1890 			return 0;
1891 		}
1892 	}
1893 
1894 	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1895 	    plist)) {
1896 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1897 		    jobref) {
1898 			td->td_retval[0] = EINPROGRESS;
1899 			splx(s);
1900 			return 0;
1901 		}
1902 	}
1903 	splx(s);
1904 
1905 	s = splbio();
1906 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1907 	    plist)) {
1908 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1909 		    jobref) {
1910 			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1911 			splx(s);
1912 			return 0;
1913 		}
1914 	}
1915 
1916 	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1917 	    plist)) {
1918 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1919 		    jobref) {
1920 			td->td_retval[0] = EINPROGRESS;
1921 			splx(s);
1922 			return 0;
1923 		}
1924 	}
1925 	splx(s);
1926 
1927 #if (0)
1928 	/*
1929 	 * Hack for lio.
1930 	 */
1931 	status = fuword(&uap->aiocbp->_aiocb_private.status);
1932 	if (status == -1)
1933 		return fuword(&uap->aiocbp->_aiocb_private.error);
1934 #endif
1935 	return EINVAL;
1936 }
1937 
1938 int
1939 aio_read(struct thread *td, struct aio_read_args *uap)
1940 {
1941 
1942 	return aio_aqueue(td, uap->aiocbp, LIO_READ);
1943 }
1944 
1945 int
1946 aio_write(struct thread *td, struct aio_write_args *uap)
1947 {
1948 
1949 	return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
1950 }
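
/*
 * Both wrappers above simply enqueue the request via aio_aqueue() with the
 * matching LIO opcode.  The fragment below is a hypothetical userland sketch
 * of filling in the control block those wrappers consume; the helper name
 * and the SIGEV_NONE notification are assumptions made for the example.
 */
#if 0
#include <sys/types.h>
#include <aio.h>
#include <signal.h>
#include <string.h>

/* Queue an asynchronous read of len bytes at offset off into buf. */
static int
queue_read(int fd, void *buf, size_t len, off_t off, struct aiocb *acb)
{

	memset(acb, 0, sizeof(*acb));
	acb->aio_fildes = fd;
	acb->aio_buf = buf;
	acb->aio_nbytes = len;
	acb->aio_offset = off;
	acb->aio_sigevent.sigev_notify = SIGEV_NONE;	/* poll with aio_error() */
	return (aio_read(acb));
}
#endif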
1951 
1952 int
1953 lio_listio(struct thread *td, struct lio_listio_args *uap)
1954 {
1955 	struct proc *p = td->td_proc;
1956 	int nent, nentqueued;
1957 	struct aiocb *iocb, * const *cbptr;
1958 	struct aiocblist *cb;
1959 	struct kaioinfo *ki;
1960 	struct aio_liojob *lj;
1961 	int error, runningcode;
1962 	int nerror;
1963 	int i;
1964 	int s;
1965 
1966 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1967 		return EINVAL;
1968 
1969 	nent = uap->nent;
1970 	if (nent > AIO_LISTIO_MAX)
1971 		return EINVAL;
1972 
1973 	if (p->p_aioinfo == NULL)
1974 		aio_init_aioinfo(p);
1975 
1976 	if ((nent + num_queue_count) > max_queue_count)
1977 		return EAGAIN;
1978 
1979 	ki = p->p_aioinfo;
1980 	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1981 		return EAGAIN;
1982 
1983 	lj = zalloc(aiolio_zone);
1984 	if (!lj)
1985 		return EAGAIN;
1986 
1987 	lj->lioj_flags = 0;
1988 	lj->lioj_buffer_count = 0;
1989 	lj->lioj_buffer_finished_count = 0;
1990 	lj->lioj_queue_count = 0;
1991 	lj->lioj_queue_finished_count = 0;
1992 	lj->lioj_ki = ki;
1993 
1994 	/*
1995 	 * Set up the completion signal for LIO_NOWAIT, if one was supplied.
1996 	 */
1997 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1998 		error = copyin(uap->sig, &lj->lioj_signal,
1999 			       sizeof(lj->lioj_signal));
2000 		if (error) {
2001 			zfree(aiolio_zone, lj);
2002 			return error;
2003 		}
2004 		if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2005 			zfree(aiolio_zone, lj);
2006 			return EINVAL;
2007 		}
2008 		lj->lioj_flags |= LIOJ_SIGNAL;
2009 		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
2010 	} else
2011 		lj->lioj_flags &= ~LIOJ_SIGNAL;
2012 
2013 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2014 	/*
2015 	 * Get pointers to the list of I/O requests.
2016 	 */
2017 	nerror = 0;
2018 	nentqueued = 0;
2019 	cbptr = uap->acb_list;
2020 	for (i = 0; i < uap->nent; i++) {
2021 		iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2022 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
2023 			error = _aio_aqueue(td, iocb, lj, 0);
2024 			if (error == 0)
2025 				nentqueued++;
2026 			else
2027 				nerror++;
2028 		}
2029 	}
2030 
2031 	/*
2032 	 * If we haven't queued any requests, just return without waiting.
2033 	 */
2034 	if (nentqueued == 0)
2035 		return 0;
2036 
2037 	/*
2038 	 * Calculate the appropriate error return.
2039 	 */
2040 	runningcode = 0;
2041 	if (nerror)
2042 		runningcode = EIO;
2043 
2044 	if (uap->mode == LIO_WAIT) {
2045 		int command, found, jobref;
2046 
2047 		for (;;) {
2048 			found = 0;
2049 			for (i = 0; i < uap->nent; i++) {
2050 				/*
2051 				 * Fetch address of the control buf pointer in
2052 				 * user space.
2053 				 */
2054 				iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2055 				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2056 				    == 0))
2057 					continue;
2058 
2059 				/*
2060 				 * Fetch the associated command from user space.
2061 				 */
2062 				command = fuword(&iocb->aio_lio_opcode);
2063 				if (command == LIO_NOP) {
2064 					found++;
2065 					continue;
2066 				}
2067 
2068 				jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2069 
2070 				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2071 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2072 					    == jobref) {
2073 						if (cb->uaiocb.aio_lio_opcode
2074 						    == LIO_WRITE) {
2075 							p->p_stats->p_ru.ru_oublock
2076 							    +=
2077 							    cb->outputcharge;
2078 							cb->outputcharge = 0;
2079 						} else if (cb->uaiocb.aio_lio_opcode
2080 						    == LIO_READ) {
2081 							p->p_stats->p_ru.ru_inblock
2082 							    += cb->inputcharge;
2083 							cb->inputcharge = 0;
2084 						}
2085 						found++;
2086 						break;
2087 					}
2088 				}
2089 
2090 				s = splbio();
2091 				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2092 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2093 					    == jobref) {
2094 						found++;
2095 						break;
2096 					}
2097 				}
2098 				splx(s);
2099 			}
2100 
2101 			/*
2102 			 * If all I/Os have been disposed of, then we can
2103 			 * return.
2104 			 */
2105 			if (found == nentqueued)
2106 				return runningcode;
2107 
2108 			ki->kaio_flags |= KAIO_WAKEUP;
2109 			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2110 
2111 			if (error == EINTR)
2112 				return EINTR;
2113 			else if (error == EWOULDBLOCK)
2114 				return EAGAIN;
2115 		}
2116 	}
2117 
2118 	return runningcode;
2119 }
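
/*
 * A hypothetical userland sketch (not part of this file) of driving the
 * LIO_WAIT path above:  each control block carries its own opcode, NULL and
 * LIO_NOP slots are not performed, and the sigevent argument is only
 * consulted for LIO_NOWAIT.  The helper name is invented for the example.
 */
#if 0
#include <aio.h>
#include <stddef.h>

/* Submit a batch of initialized control blocks and block until all finish. */
static int
submit_and_wait(struct aiocb * const acbs[], int n)
{

	if (lio_listio(LIO_WAIT, acbs, n, NULL) != 0)
		return (-1);	/* errno: EIO (partial failure), EAGAIN, ... */
	return (0);
}
#endif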
2120 
2121 /*
2122  * Deferred signal posting:  psignal() must not be called from an interrupt
2123  * routine, but it is safe from a timeout routine, hence this handler.
2124  */
2125 static void
2126 process_signal(void *aioj)
2127 {
2128 	struct aiocblist *aiocbe = aioj;
2129 	struct aio_liojob *lj = aiocbe->lio;
2130 	struct aiocb *cb = &aiocbe->uaiocb;
2131 
2132 	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2133 		(lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2134 		PROC_LOCK(lj->lioj_ki->kaio_p);
2135 		psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2136 		PROC_UNLOCK(lj->lioj_ki->kaio_p);
2137 		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2138 	}
2139 
2140 	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2141 		PROC_LOCK(aiocbe->userproc);
2142 		psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2143 		PROC_UNLOCK(aiocbe->userproc);
2144 	}
2145 }
2146 
2147 /*
2148  * Completion handler for physio.  Performs the necessary process wakeups
2149  * and defers signal delivery to process_signal() via timeout(9).
2150  */
2151 static void
2152 aio_physwakeup(struct buf *bp)
2153 {
2154 	struct aiocblist *aiocbe;
2155 	struct proc *p;
2156 	struct kaioinfo *ki;
2157 	struct aio_liojob *lj;
2158 
2159 	wakeup(bp);
2160 
2161 	aiocbe = (struct aiocblist *)bp->b_spc;
2162 	if (aiocbe) {
2163 		p = bp->b_caller1;
2164 
2165 		aiocbe->jobstate = JOBST_JOBBFINISHED;
2166 		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2167 		aiocbe->uaiocb._aiocb_private.error = 0;
2168 		aiocbe->jobflags |= AIOCBLIST_DONE;
2169 
2170 		if (bp->b_ioflags & BIO_ERROR)
2171 			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2172 
2173 		lj = aiocbe->lio;
2174 		if (lj) {
2175 			lj->lioj_buffer_finished_count++;
2176 
2177 			/*
2178 			 * wakeup/signal if all of the interrupt jobs are done.
2179 			 */
2180 			if (lj->lioj_buffer_finished_count ==
2181 			    lj->lioj_buffer_count) {
2182 				/*
2183 				 * Post a signal if it is called for.
2184 				 */
2185 				if ((lj->lioj_flags &
2186 				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2187 				    LIOJ_SIGNAL) {
2188 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2189 					aiocbe->timeouthandle =
2190 						timeout(process_signal,
2191 							aiocbe, 0);
2192 				}
2193 			}
2194 		}
2195 
2196 		ki = p->p_aioinfo;
2197 		if (ki) {
2198 			ki->kaio_buffer_finished_count++;
2199 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2200 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2201 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2202 
2203 			KNOTE(&aiocbe->klist, 0);
2204 			/* Do the wakeup. */
2205 			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2206 				ki->kaio_flags &= ~KAIO_WAKEUP;
2207 				wakeup(p);
2208 			}
2209 		}
2210 
2211 		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2212 			aiocbe->timeouthandle =
2213 				timeout(process_signal, aiocbe, 0);
2214 	}
2215 }
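
/*
 * The handler above recovers its context from fields that the submission
 * path is expected to fill in before starting the transfer.  The fragment
 * below is a sketch inferred from the reads above (b_caller1, b_spc,
 * b_iodone); the authoritative setup lives in aio_qphysio() elsewhere in
 * this file, and the helper name here is invented.
 */
#if 0
static void
aio_phys_wire(struct buf *bp, struct proc *p, struct aiocblist *aiocbe)
{

	bp->b_caller1 = (void *)p;	/* owning process, read back above */
	bp->b_spc = (void *)aiocbe;	/* queued AIO request, read back above */
	bp->b_iodone = aio_physwakeup;	/* invoked from biodone() */
}
#endif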
2216 
2217 int
2218 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2219 {
2220 	struct proc *p = td->td_proc;
2221 	struct timeval atv;
2222 	struct timespec ts;
2223 	struct aiocb **cbptr;
2224 	struct kaioinfo *ki;
2225 	struct aiocblist *cb = NULL;
2226 	int error, s, timo;
2227 
2228 	suword(uap->aiocbp, (long)NULL);
2229 
2230 	timo = 0;
2231 	if (uap->timeout) {
2232 		/* Get timespec struct. */
2233 		error = copyin(uap->timeout, &ts, sizeof(ts));
2234 		if (error)
2235 			return error;
2236 
2237 		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2238 			return (EINVAL);
2239 
2240 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
2241 		if (itimerfix(&atv))
2242 			return (EINVAL);
2243 		timo = tvtohz(&atv);
2244 	}
2245 
2246 	ki = p->p_aioinfo;
2247 	if (ki == NULL)
2248 		return EAGAIN;
2249 
2250 	cbptr = uap->aiocbp;
2251 
2252 	for (;;) {
2253 		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2254 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2255 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2256 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2257 				p->p_stats->p_ru.ru_oublock +=
2258 				    cb->outputcharge;
2259 				cb->outputcharge = 0;
2260 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2261 				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2262 				cb->inputcharge = 0;
2263 			}
			error = cb->uaiocb._aiocb_private.error;
2264 			aio_free_entry(cb);
2265 			return error;
2266 		}
2267 
2268 		s = splbio();
2269 		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0) {
2270 			splx(s);
2271 			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2272 			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
			error = cb->uaiocb._aiocb_private.error;
2273 			aio_free_entry(cb);
2274 			return error;
2275 		}
2276 
2277 		ki->kaio_flags |= KAIO_WAKEUP;
2278 		error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2279 		splx(s);
2280 
2281 		if (error == ERESTART)
2282 			return EINTR;
2283 		else if (error < 0)
2284 			return error;
2285 		else if (error == EINTR)
2286 			return EINTR;
2287 		else if (error == EWOULDBLOCK)
2288 			return EAGAIN;
2289 	}
2290 }
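
/*
 * Hypothetical userland counterpart (not part of this file):  the syscall
 * above returns the completion status of whichever request finishes first
 * and stores a pointer to its control block through the first argument.
 * The helper name below is invented, and the ssize_t prototype for
 * aio_waitcomplete() is assumed from the libc wrapper.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stddef.h>
#include <time.h>

/* Reap the next completed request; -1/errno on error or expired timeout. */
static ssize_t
reap_next(struct aiocb **done, struct timespec *timo)
{
	ssize_t n;

	do {
		*done = NULL;
		n = aio_waitcomplete(done, timo);
	} while (n == -1 && errno == EINTR);	/* interrupted; retry */
	return (n);
}
#endif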
2291 
2292 static int
2293 filt_aioattach(struct knote *kn)
2294 {
2295 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2296 
2297 	/*
2298 	 * The aiocbe pointer must be validated before using it, so
2299 	 * registration is restricted to the kernel; the user cannot
2300 	 * set EV_FLAG1.
2301 	 */
2302 	if ((kn->kn_flags & EV_FLAG1) == 0)
2303 		return (EPERM);
2304 	kn->kn_flags &= ~EV_FLAG1;
2305 
2306 	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2307 
2308 	return (0);
2309 }
2310 
2311 static void
2312 filt_aiodetach(struct knote *kn)
2313 {
2314 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2315 
2316 	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2317 }
2318 
2319 /*ARGSUSED*/
2320 static int
2321 filt_aio(struct knote *kn, long hint)
2322 {
2323 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2324 
2325 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2326 	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2327 	    aiocbe->jobstate != JOBST_JOBBFINISHED)
2328 		return (0);
2329 	kn->kn_flags |= EV_EOF;
2330 	return (1);
2331 }
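
/*
 * Presumed shape of the kernel-side registration that pairs with the filter
 * above (the real code is in _aio_aqueue()):  ident carries the in-kernel
 * aiocblist pointer that filt_aioattach() links onto its klist, and EV_FLAG1
 * marks the registration as kernel-originated so the permission check above
 * passes.  The field values shown are an illustrative sketch only.
 */
#if 0
	struct kevent kev;

	kev.ident = (uintptr_t)aiocbe;
	kev.filter = EVFILT_AIO;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
#endif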
2332