xref: /freebsd/sys/kern/vfs_aio.c (revision 77a0943ded95b9e6438f7db70c4a28e4d93946d4)
1 /*
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  *
16  * $FreeBSD$
17  */
18 
19 /*
20  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21  */
22  */
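/*
 * For orientation, a minimal sketch (not part of the kernel; error handling
 * omitted, file name purely illustrative) of how a userland program drives
 * this facility through the POSIX interface implemented below:
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	char buf[512];
 *	struct aiocb cb;
 *	int fd = open("/path/to/file", O_RDONLY);
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;			file to read from
 *	cb.aio_buf = buf;			destination buffer
 *	cb.aio_nbytes = sizeof(buf);		transfer size
 *	cb.aio_offset = 0;			absolute file offset
 *	aio_read(&cb);				enqueue the request
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;				poll, or use aio_suspend()
 *	ssize_t n = aio_return(&cb);		reap status, release kernel
 *						resources
 */
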
23 #include <sys/param.h>
24 #include <sys/systm.h>
25 #include <sys/bio.h>
26 #include <sys/buf.h>
27 #include <sys/sysproto.h>
28 #include <sys/filedesc.h>
29 #include <sys/kernel.h>
30 #include <sys/fcntl.h>
31 #include <sys/file.h>
32 #include <sys/lock.h>
33 #include <sys/mutex.h>
34 #include <sys/unistd.h>
35 #include <sys/proc.h>
36 #include <sys/resourcevar.h>
37 #include <sys/signalvar.h>
38 #include <sys/protosw.h>
39 #include <sys/socketvar.h>
40 #include <sys/sysctl.h>
41 #include <sys/vnode.h>
42 #include <sys/conf.h>
43 #include <sys/event.h>
44 
45 #include <vm/vm.h>
46 #include <vm/vm_extern.h>
47 #include <vm/pmap.h>
48 #include <vm/vm_map.h>
49 #include <vm/vm_zone.h>
50 #include <sys/aio.h>
51 
52 #include <machine/limits.h>
53 
54 #include "opt_vfs_aio.h"
55 
56 #ifdef VFS_AIO
57 
58 static	long jobrefid;
59 
60 #define JOBST_NULL		0x0
61 #define	JOBST_JOBQPROC		0x1
62 #define JOBST_JOBQGLOBAL	0x2
63 #define JOBST_JOBRUNNING	0x3
64 #define JOBST_JOBFINISHED	0x4
65 #define	JOBST_JOBQBUF		0x5
66 #define	JOBST_JOBBFINISHED	0x6
67 
68 #ifndef MAX_AIO_PER_PROC
69 #define MAX_AIO_PER_PROC	32
70 #endif
71 
72 #ifndef MAX_AIO_QUEUE_PER_PROC
73 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
74 #endif
75 
76 #ifndef MAX_AIO_PROCS
77 #define MAX_AIO_PROCS		32
78 #endif
79 
80 #ifndef MAX_AIO_QUEUE
81 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
82 #endif
83 
84 #ifndef TARGET_AIO_PROCS
85 #define TARGET_AIO_PROCS	4
86 #endif
87 
88 #ifndef MAX_BUF_AIO
89 #define MAX_BUF_AIO		16
90 #endif
91 
92 #ifndef AIOD_TIMEOUT_DEFAULT
93 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
94 #endif
95 
96 #ifndef AIOD_LIFETIME_DEFAULT
97 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
98 #endif
99 
100 static int max_aio_procs = MAX_AIO_PROCS;
101 static int num_aio_procs = 0;
102 static int target_aio_procs = TARGET_AIO_PROCS;
103 static int max_queue_count = MAX_AIO_QUEUE;
104 static int num_queue_count = 0;
105 static int num_buf_aio = 0;
106 static int num_aio_resv_start = 0;
107 static int aiod_timeout;
108 static int aiod_lifetime;
109 
110 static int max_aio_per_proc = MAX_AIO_PER_PROC;
111 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
112 static int max_buf_aio = MAX_BUF_AIO;
113 
114 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
115 
116 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
117 	CTLFLAG_RW, &max_aio_per_proc, 0, "");
118 
119 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
120 	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
121 
122 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
123 	CTLFLAG_RW, &max_aio_procs, 0, "");
124 
125 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
126 	CTLFLAG_RD, &num_aio_procs, 0, "");
127 
128 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
129 	CTLFLAG_RD, &num_queue_count, 0, "");
130 
131 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
132 	CTLFLAG_RW, &max_queue_count, 0, "");
133 
134 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
135 	CTLFLAG_RW, &target_aio_procs, 0, "");
136 
137 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
138 	CTLFLAG_RW, &max_buf_aio, 0, "");
139 
140 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
141 	CTLFLAG_RD, &num_buf_aio, 0, "");
142 
143 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
144 	CTLFLAG_RW, &aiod_lifetime, 0, "");
145 
146 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
147 	CTLFLAG_RW, &aiod_timeout, 0, "");
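
/*
 * The SYSCTL_NODE/SYSCTL_INT declarations above expose these knobs under the
 * vfs.aio tree, e.g. vfs.aio.max_aio_per_proc, vfs.aio.max_aio_queue and
 * vfs.aio.num_buf_aio, so they can be inspected (and the CTLFLAG_RW ones
 * tuned) at run time with sysctl(8), for example
 * "sysctl vfs.aio.max_buf_aio=32" (illustrative value only).
 */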
148 
149 /*
150  * AIO process info
151  */
152 #define AIOP_FREE	0x1			/* proc on free queue */
153 #define AIOP_SCHED	0x2			/* proc explicitly scheduled */
154 
155 struct aioproclist {
156 	int aioprocflags;			/* AIO proc flags */
157 	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
158 	struct proc *aioproc;			/* The AIO thread */
159 	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
160 };
161 
162 /*
163  * data-structure for lio signal management
164  */
165 struct aio_liojob {
166 	int	lioj_flags;
167 	int	lioj_buffer_count;
168 	int	lioj_buffer_finished_count;
169 	int	lioj_queue_count;
170 	int	lioj_queue_finished_count;
171 	struct	sigevent lioj_signal;	/* signal on all I/O done */
172 	TAILQ_ENTRY	(aio_liojob) lioj_list;
173 	struct	kaioinfo *lioj_ki;
174 };
175 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
176 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
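
/*
 * For orientation, the userland counterpart that this bookkeeping serves (a
 * minimal sketch, not kernel code; cb0 and cb1 are assumed to be
 * already-initialized aiocbs): lio_listio() submits a batch of requests and
 * may ask for a single signal once the whole batch completes, which is what
 * lioj_signal and the LIOJ_SIGNAL* flags track.
 *
 *	struct aiocb *list[2] = { &cb0, &cb1 };
 *	struct sigevent sev;
 *
 *	memset(&sev, 0, sizeof(sev));
 *	sev.sigev_notify = SIGEV_SIGNAL;
 *	sev.sigev_signo = SIGUSR1;		signal to post on completion
 *	lio_listio(LIO_NOWAIT, list, 2, &sev);	queue both, return at once
 */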
177 
178 /*
179  * per process aio data structure
180  */
181 struct kaioinfo {
182 	int	kaio_flags;		/* per process kaio flags */
183 	int	kaio_maxactive_count;	/* maximum number of AIOs */
184 	int	kaio_active_count;	/* number of currently used AIOs */
185 	int	kaio_qallowed_count;	/* maximum size of AIO queue */
186 	int	kaio_queue_count;	/* size of AIO queue */
187 	int	kaio_ballowed_count;	/* maximum number of buffers */
188 	int	kaio_queue_finished_count; /* number of daemon jobs finished */
189 	int	kaio_buffer_count;	/* number of physio buffers */
190 	int	kaio_buffer_finished_count; /* count of I/O done */
191 	struct 	proc *kaio_p;		/* process that uses this kaio block */
192 	TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
193 	TAILQ_HEAD (,aiocblist)	kaio_jobqueue;	/* job queue for process */
194 	TAILQ_HEAD (,aiocblist)	kaio_jobdone;	/* done queue for process */
195 	TAILQ_HEAD (,aiocblist)	kaio_bufqueue;	/* buffer job queue for process */
196 	TAILQ_HEAD (,aiocblist)	kaio_bufdone;	/* buffer done queue for process */
197 	TAILQ_HEAD (,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
198 };
199 
200 #define KAIO_RUNDOWN	0x1	/* process is being run down */
201 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
202 
203 static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
204 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
205 static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
206 static TAILQ_HEAD(,aiocblist) aio_freejobs;		/* Pool of free jobs */
207 
208 static void	aio_init_aioinfo(struct proc *p);
209 static void	aio_onceonly(void *);
210 static int	aio_free_entry(struct aiocblist *aiocbe);
211 static void	aio_process(struct aiocblist *aiocbe);
212 static int	aio_newproc(void);
213 static int	aio_aqueue(struct proc *p, struct aiocb *job, int type);
214 static void	aio_physwakeup(struct buf *bp);
215 static int	aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
216 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
217 static void	aio_daemon(void *uproc);
218 
219 SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
220 
221 static vm_zone_t kaio_zone = 0, aiop_zone = 0, aiocb_zone = 0, aiol_zone = 0;
222 static vm_zone_t aiolio_zone = 0;
223 
224 /*
225  * Startup initialization
226  */
227 void
228 aio_onceonly(void *na)
229 {
230 	TAILQ_INIT(&aio_freeproc);
231 	TAILQ_INIT(&aio_activeproc);
232 	TAILQ_INIT(&aio_jobs);
233 	TAILQ_INIT(&aio_bufjobs);
234 	TAILQ_INIT(&aio_freejobs);
235 	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
236 	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
237 	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
238 	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
239 	aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct
240 	    aio_liojob), 0, 0, 1);
241 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
242 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
243 	jobrefid = 1;
244 }
245 
246 /*
247  * Init the per-process aioinfo structure.  The aioinfo limits are set
248  * per-process for user limit (resource) management.
249  */
250 void
251 aio_init_aioinfo(struct proc *p)
252 {
253 	struct kaioinfo *ki;
254 	if (p->p_aioinfo == NULL) {
255 		ki = zalloc(kaio_zone);
256 		p->p_aioinfo = ki;
257 		ki->kaio_flags = 0;
258 		ki->kaio_maxactive_count = max_aio_per_proc;
259 		ki->kaio_active_count = 0;
260 		ki->kaio_qallowed_count = max_aio_queue_per_proc;
261 		ki->kaio_queue_count = 0;
262 		ki->kaio_ballowed_count = max_buf_aio;
263 		ki->kaio_buffer_count = 0;
264 		ki->kaio_buffer_finished_count = 0;
265 		ki->kaio_p = p;
266 		TAILQ_INIT(&ki->kaio_jobdone);
267 		TAILQ_INIT(&ki->kaio_jobqueue);
268 		TAILQ_INIT(&ki->kaio_bufdone);
269 		TAILQ_INIT(&ki->kaio_bufqueue);
270 		TAILQ_INIT(&ki->kaio_liojoblist);
271 		TAILQ_INIT(&ki->kaio_sockqueue);
272 	}
273 
274 	while (num_aio_procs < target_aio_procs)
275 		aio_newproc();
276 }
277 
278 /*
279  * Free a job entry.  Wait for completion if it is currently active, but don't
280  * delay forever.  If we delay, we return a flag that says that we have to
281  * restart the queue scan.
282  */
283 int
284 aio_free_entry(struct aiocblist *aiocbe)
285 {
286 	struct kaioinfo *ki;
287 	struct aioproclist *aiop;
288 	struct aio_liojob *lj;
289 	struct proc *p;
290 	int error;
291 	int s;
292 
293 	if (aiocbe->jobstate == JOBST_NULL)
294 		panic("aio_free_entry: freeing already free job");
295 
296 	p = aiocbe->userproc;
297 	ki = p->p_aioinfo;
298 	lj = aiocbe->lio;
299 	if (ki == NULL)
300 		panic("aio_free_entry: missing p->p_aioinfo");
301 
302 	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
303 		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
304 			return 0;
305 		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
306 		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
307 	}
308 	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
309 
310 	if (aiocbe->bp == NULL) {
311 		if (ki->kaio_queue_count <= 0)
312 			panic("aio_free_entry: process queue size <= 0");
313 		if (num_queue_count <= 0)
314 			panic("aio_free_entry: system wide queue size <= 0");
315 
316 		if (lj) {
317 			lj->lioj_queue_count--;
318 			if (aiocbe->jobflags & AIOCBLIST_DONE)
319 				lj->lioj_queue_finished_count--;
320 		}
321 		ki->kaio_queue_count--;
322 		if (aiocbe->jobflags & AIOCBLIST_DONE)
323 			ki->kaio_queue_finished_count--;
324 		num_queue_count--;
325 	} else {
326 		if (lj) {
327 			lj->lioj_buffer_count--;
328 			if (aiocbe->jobflags & AIOCBLIST_DONE)
329 				lj->lioj_buffer_finished_count--;
330 		}
331 		if (aiocbe->jobflags & AIOCBLIST_DONE)
332 			ki->kaio_buffer_finished_count--;
333 		ki->kaio_buffer_count--;
334 		num_buf_aio--;
335 	}
336 
337 	/* aiocbe is going away; we need to destroy any knotes */
338 	knote_remove(p, &aiocbe->klist);
339 
340 	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
341 	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
342 		ki->kaio_flags &= ~KAIO_WAKEUP;
343 		wakeup(p);
344 	}
345 
346 	if (aiocbe->jobstate == JOBST_JOBQBUF) {
347 		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
348 			return error;
349 		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
350 			panic("aio_free_entry: invalid physio finish-up state");
351 		s = splbio();
352 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
353 		splx(s);
354 	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
355 		aiop = aiocbe->jobaioproc;
356 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
357 	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL)
358 		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
359 	else if (aiocbe->jobstate == JOBST_JOBFINISHED)
360 		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
361 	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
362 		s = splbio();
363 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
364 		splx(s);
365 		if (aiocbe->bp) {
366 			vunmapbuf(aiocbe->bp);
367 			relpbuf(aiocbe->bp, NULL);
368 			aiocbe->bp = NULL;
369 		}
370 	}
371 	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
372 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
373 		zfree(aiolio_zone, lj);
374 	}
375 	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
376 	aiocbe->jobstate = JOBST_NULL;
377 	return 0;
378 }
379 #endif /* VFS_AIO */
380 
381 /*
382  * Rundown the jobs for a given process.
383  */
384 void
385 aio_proc_rundown(struct proc *p)
386 {
387 #ifndef VFS_AIO
388 	return;
389 #else
390 	int s;
391 	struct kaioinfo *ki;
392 	struct aio_liojob *lj, *ljn;
393 	struct aiocblist *aiocbe, *aiocbn;
394 	struct file *fp;
395 	struct filedesc *fdp;
396 	struct socket *so;
397 
398 	ki = p->p_aioinfo;
399 	if (ki == NULL)
400 		return;
401 
402 	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
403 	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
404 	    ki->kaio_buffer_finished_count)) {
405 		ki->kaio_flags |= KAIO_RUNDOWN;
406 		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
407 			break;
408 	}
409 
410 	/*
411 	 * Move any aio ops that are waiting on socket I/O to the normal job
412 	 * queues so they are cleaned up with any others.
413 	 */
414 	fdp = p->p_fd;
415 
416 	s = splnet();
417 	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
418 	    aiocbn) {
419 		aiocbn = TAILQ_NEXT(aiocbe, plist);
420 		fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];
421 
422 		/*
423 		 * Under some circumstances, the aio_fildes and the file
424 		 * structure don't match.  This would leave aiocbe's in the
425 		 * TAILQ associated with the socket and cause a panic later.
426 		 *
427 		 * Detect and fix.
428 		 */
429 		if ((fp == NULL) || (fp != aiocbe->fd_file))
430 			fp = aiocbe->fd_file;
431 		if (fp) {
432 			so = (struct socket *)fp->f_data;
433 			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
434 			if (TAILQ_EMPTY(&so->so_aiojobq)) {
435 				so->so_snd.sb_flags &= ~SB_AIO;
436 				so->so_rcv.sb_flags &= ~SB_AIO;
437 			}
438 		}
439 		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
440 		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
441 		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
442 	}
443 	splx(s);
444 
445 restart1:
446 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
447 		aiocbn = TAILQ_NEXT(aiocbe, plist);
448 		if (aio_free_entry(aiocbe))
449 			goto restart1;
450 	}
451 
452 restart2:
453 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
454 	    aiocbn) {
455 		aiocbn = TAILQ_NEXT(aiocbe, plist);
456 		if (aio_free_entry(aiocbe))
457 			goto restart2;
458 	}
459 
460 /*
461  * Note the use of many short splbio() sections here, to avoid holding splbio
462  * across long chains of I/O.  Probably unnecessary.
463  */
464 restart3:
465 	s = splbio();
466 	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
467 		ki->kaio_flags |= KAIO_WAKEUP;
468 		tsleep(p, PRIBIO, "aioprn", 0);
469 		splx(s);
470 		goto restart3;
471 	}
472 	splx(s);
473 
474 restart4:
475 	s = splbio();
476 	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
477 		aiocbn = TAILQ_NEXT(aiocbe, plist);
478 		if (aio_free_entry(aiocbe)) {
479 			splx(s);
480 			goto restart4;
481 		}
482 	}
483 	splx(s);
484 
485 	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
486 		ljn = TAILQ_NEXT(lj, lioj_list);
487 		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
488 		    0)) {
489 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
490 			zfree(aiolio_zone, lj);
491 		} else {
492 #ifdef DIAGNOSTIC
493 			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
494 			    "QF:%d\n", lj->lioj_buffer_count,
495 			    lj->lioj_buffer_finished_count,
496 			    lj->lioj_queue_count,
497 			    lj->lioj_queue_finished_count);
498 #endif
499 		}
500 	}
501 
502 	zfree(kaio_zone, ki);
503 	p->p_aioinfo = NULL;
504 #endif /* VFS_AIO */
505 }
506 
507 #ifdef VFS_AIO
508 /*
509  * Select a job to run (called by an AIO daemon).
510  */
511 static struct aiocblist *
512 aio_selectjob(struct aioproclist *aiop)
513 {
514 	int s;
515 	struct aiocblist *aiocbe;
516 	struct kaioinfo *ki;
517 	struct proc *userp;
518 
519 	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
520 	if (aiocbe) {
521 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
522 		return aiocbe;
523 	}
524 
525 	s = splnet();
526 	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
527 	    TAILQ_NEXT(aiocbe, list)) {
528 		userp = aiocbe->userproc;
529 		ki = userp->p_aioinfo;
530 
531 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
532 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
533 			splx(s);
534 			return aiocbe;
535 		}
536 	}
537 	splx(s);
538 
539 	return NULL;
540 }
541 
542 /*
543  * The AIO processing activity.  This is the code that performs the I/O
544  * request for the non-physio version of the operations.  The normal vn
545  * operations are used, and this code should work for every type of file,
546  * including pipes, sockets, FIFOs, and regular files.
547  */
548 void
549 aio_process(struct aiocblist *aiocbe)
550 {
551 	struct filedesc *fdp;
552 	struct proc *userp, *mycp;
553 	struct aiocb *cb;
554 	struct file *fp;
555 	struct uio auio;
556 	struct iovec aiov;
557 	unsigned int fd;
558 	int cnt;
559 	int error;
560 	off_t offset;
561 	int oublock_st, oublock_end;
562 	int inblock_st, inblock_end;
563 
564 	userp = aiocbe->userproc;
565 	cb = &aiocbe->uaiocb;
566 
567 	mycp = curproc;
568 
569 	fdp = mycp->p_fd;
570 	fd = cb->aio_fildes;
571 	fp = fdp->fd_ofiles[fd];
572 
573 	if ((fp == NULL) || (fp != aiocbe->fd_file)) {
574 		cb->_aiocb_private.error = EBADF;
575 		cb->_aiocb_private.status = -1;
576 		return;
577 	}
578 
579 	aiov.iov_base = (void *)cb->aio_buf;
580 	aiov.iov_len = cb->aio_nbytes;
581 
582 	auio.uio_iov = &aiov;
583 	auio.uio_iovcnt = 1;
584 	auio.uio_offset = offset = cb->aio_offset;
585 	auio.uio_resid = cb->aio_nbytes;
586 	cnt = cb->aio_nbytes;
587 	auio.uio_segflg = UIO_USERSPACE;
588 	auio.uio_procp = mycp;
589 
590 	inblock_st = mycp->p_stats->p_ru.ru_inblock;
591 	oublock_st = mycp->p_stats->p_ru.ru_oublock;
592 	/*
593 	 * Temporarily bump the ref count while reading to avoid the
594 	 * descriptor being ripped out from under us.
595 	 */
596 	fhold(fp);
597 	if (cb->aio_lio_opcode == LIO_READ) {
598 		auio.uio_rw = UIO_READ;
599 		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
600 	} else {
601 		auio.uio_rw = UIO_WRITE;
602 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
603 	}
604 	fdrop(fp, mycp);
605 	inblock_end = mycp->p_stats->p_ru.ru_inblock;
606 	oublock_end = mycp->p_stats->p_ru.ru_oublock;
607 
608 	aiocbe->inputcharge = inblock_end - inblock_st;
609 	aiocbe->outputcharge = oublock_end - oublock_st;
610 
611 	if ((error) && (auio.uio_resid != cnt)) {
612 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
613 			error = 0;
614 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
615 			psignal(userp, SIGPIPE);
616 	}
617 
618 	cnt -= auio.uio_resid;
619 	cb->_aiocb_private.error = error;
620 	cb->_aiocb_private.status = cnt;
621 
622 	return;
623 }
624 
625 /*
626  * The AIO daemon.  Most of the actual work is done in aio_process(), but
627  * the setup (and address space management) is done in this routine.
628  */
629 static void
630 aio_daemon(void *uproc)
631 {
632 	int s;
633 	struct aio_liojob *lj;
634 	struct aiocb *cb;
635 	struct aiocblist *aiocbe;
636 	struct aioproclist *aiop;
637 	struct kaioinfo *ki;
638 	struct proc *curcp, *mycp, *userp;
639 	struct vmspace *myvm, *tmpvm;
640 
641 	mtx_enter(&Giant, MTX_DEF);
642 	/*
643 	 * Local copies of curproc (mycp) and vmspace (myvm)
644 	 */
645 	mycp = curproc;
646 	myvm = mycp->p_vmspace;
647 
648 	if (mycp->p_textvp) {
649 		vrele(mycp->p_textvp);
650 		mycp->p_textvp = NULL;
651 	}
652 
653 	/*
654 	 * Allocate and ready the aio control info.  There is one aiop structure
655 	 * per daemon.
656 	 */
657 	aiop = zalloc(aiop_zone);
658 	aiop->aioproc = mycp;
659 	aiop->aioprocflags |= AIOP_FREE;
660 	TAILQ_INIT(&aiop->jobtorun);
661 
662 	s = splnet();
663 
664 	/*
665 	 * Place thread (lightweight process) onto the AIO free thread list.
666 	 */
667 	if (TAILQ_EMPTY(&aio_freeproc))
668 		wakeup(&aio_freeproc);
669 	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
670 
671 	splx(s);
672 
673 	/* Make up a name for the daemon. */
674 	strcpy(mycp->p_comm, "aiod");
675 
676 	/*
677 	 * Get rid of our current file descriptors.  AIODs don't need any
678 	 * file descriptors, except as temporarily inherited from the client.
679 	 * Credentials are also cloned, and made equivalent to "root".
680 	 */
681 	fdfree(mycp);
682 	mycp->p_fd = NULL;
683 	mycp->p_ucred = crcopy(mycp->p_ucred);
684 	mycp->p_ucred->cr_uid = 0;
685 	uifree(mycp->p_ucred->cr_uidinfo);
686 	mycp->p_ucred->cr_uidinfo = uifind(0);
687 	mycp->p_ucred->cr_ngroups = 1;
688 	mycp->p_ucred->cr_groups[0] = 1;
689 
690 	/* The daemon resides in its own pgrp. */
691 	enterpgrp(mycp, mycp->p_pid, 1);
692 
693 	/* Mark special process type. */
694 	mycp->p_flag |= P_SYSTEM | P_KTHREADP;
695 
696 	/*
697 	 * Wake up the parent process.  (The parent sleeps to avoid creating
698 	 * too many daemons at once.)
699 	 */
700 	wakeup(mycp);
701 
702 	for (;;) {
703 		/*
704 		 * curcp is the current daemon process context.
705 		 * userp is the current user process context.
706 		 */
707 		curcp = mycp;
708 
709 		/*
710 		 * Take daemon off of free queue
711 		 */
712 		if (aiop->aioprocflags & AIOP_FREE) {
713 			s = splnet();
714 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
715 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
716 			aiop->aioprocflags &= ~AIOP_FREE;
717 			splx(s);
718 		}
719 		aiop->aioprocflags &= ~AIOP_SCHED;
720 
721 		/*
722 		 * Check for jobs.
723 		 */
724 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
725 			cb = &aiocbe->uaiocb;
726 			userp = aiocbe->userproc;
727 
728 			aiocbe->jobstate = JOBST_JOBRUNNING;
729 
730 			/*
731 			 * Connect to process address space for user program.
732 			 */
733 			if (userp != curcp) {
734 				/*
735 				 * Save the current address space that we are
736 				 * connected to.
737 				 */
738 				tmpvm = mycp->p_vmspace;
739 
740 				/*
741 				 * Point to the new user address space, and
742 				 * refer to it.
743 				 */
744 				mycp->p_vmspace = userp->p_vmspace;
745 				mycp->p_vmspace->vm_refcnt++;
746 
747 				/* Activate the new mapping. */
748 				pmap_activate(mycp);
749 
750 				/*
751 				 * If the old address space wasn't the daemon's
752 				 * own address space, then we need to remove the
753 				 * daemon's reference from the other process
754 				 * that it was acting on behalf of.
755 				 */
756 				if (tmpvm != myvm) {
757 					vmspace_free(tmpvm);
758 				}
759 
760 				/*
761 				 * Disassociate from previous clients file
762 				 * descriptors, and associate to the new clients
763 				 * descriptors.  Note that the daemon doesn't
764 				 * need to worry about its orginal descriptors,
765 				 * because they were originally freed.
766 				 */
767 				if (mycp->p_fd)
768 					fdfree(mycp);
769 				mycp->p_fd = fdshare(userp);
770 				curcp = userp;
771 			}
772 
773 			ki = userp->p_aioinfo;
774 			lj = aiocbe->lio;
775 
776 			/* Account for currently active jobs. */
777 			ki->kaio_active_count++;
778 
779 			/* Do the I/O function. */
780 			aiocbe->jobaioproc = aiop;
781 			aio_process(aiocbe);
782 
783 			/* Decrement the active job count. */
784 			ki->kaio_active_count--;
785 
786 			/*
787 			 * Increment the completion count for wakeup/signal
788 			 * comparisons.
789 			 */
790 			aiocbe->jobflags |= AIOCBLIST_DONE;
791 			ki->kaio_queue_finished_count++;
792 			if (lj)
793 				lj->lioj_queue_finished_count++;
794 			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
795 			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
796 				ki->kaio_flags &= ~KAIO_WAKEUP;
797 				wakeup(userp);
798 			}
799 
800 			s = splbio();
801 			if (lj && (lj->lioj_flags &
802 			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
803 				if ((lj->lioj_queue_finished_count ==
804 				    lj->lioj_queue_count) &&
805 				    (lj->lioj_buffer_finished_count ==
806 				    lj->lioj_buffer_count)) {
807 						psignal(userp,
808 						    lj->lioj_signal.sigev_signo);
809 						lj->lioj_flags |=
810 						    LIOJ_SIGNAL_POSTED;
811 				}
812 			}
813 			splx(s);
814 
815 			aiocbe->jobstate = JOBST_JOBFINISHED;
816 
817 			/*
818 			 * If the I/O request should be automatically rundown,
819 			 * do the needed cleanup.  Otherwise, place the queue
820 			 * entry for the just finished I/O request into the done
821 			 * queue for the associated client.
822 			 */
823 			s = splnet();
824 			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
825 				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
826 				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
827 			} else {
828 				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
829 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
830 				    plist);
831 			}
832 			splx(s);
833 			KNOTE(&aiocbe->klist, 0);
834 
835 			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
836 				wakeup(aiocbe);
837 				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
838 			}
839 
840 			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
841 				psignal(userp, cb->aio_sigevent.sigev_signo);
842 			}
843 		}
844 
845 		/*
846 		 * Disconnect from user address space.
847 		 */
848 		if (curcp != mycp) {
849 			/* Get the user address space to disconnect from. */
850 			tmpvm = mycp->p_vmspace;
851 
852 			/* Get original address space for daemon. */
853 			mycp->p_vmspace = myvm;
854 
855 			/* Activate the daemon's address space. */
856 			pmap_activate(mycp);
857 #ifdef DIAGNOSTIC
858 			if (tmpvm == myvm) {
859 				printf("AIOD: vmspace problem -- %d\n",
860 				    mycp->p_pid);
861 			}
862 #endif
863 			/* Remove our vmspace reference. */
864 			vmspace_free(tmpvm);
865 
866 			/*
867 			 * Disassociate from the user process's file
868 			 * descriptors.
869 			 */
870 			if (mycp->p_fd)
871 				fdfree(mycp);
872 			mycp->p_fd = NULL;
873 			curcp = mycp;
874 		}
875 
876 		/*
877 		 * If we are the first to be put onto the free queue, wakeup
878 		 * anyone waiting for a daemon.
879 		 */
880 		s = splnet();
881 		TAILQ_REMOVE(&aio_activeproc, aiop, list);
882 		if (TAILQ_EMPTY(&aio_freeproc))
883 			wakeup(&aio_freeproc);
884 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
885 		aiop->aioprocflags |= AIOP_FREE;
886 		splx(s);
887 
888 		/*
889 		 * If daemon is inactive for a long time, allow it to exit,
890 		 * thereby freeing resources.
891 		 */
892 		if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
893 		    PRIBIO, "aiordy", aiod_lifetime)) {
894 			s = splnet();
895 			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
896 			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
897 				if ((aiop->aioprocflags & AIOP_FREE) &&
898 				    (num_aio_procs > target_aio_procs)) {
899 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
900 					splx(s);
901 					zfree(aiop_zone, aiop);
902 					num_aio_procs--;
903 #ifdef DIAGNOSTIC
904 					if (mycp->p_vmspace->vm_refcnt <= 1) {
905 						printf("AIOD: bad vm refcnt for"
906 						    " exiting daemon: %d\n",
907 						    mycp->p_vmspace->vm_refcnt);
908 					}
909 #endif
910 					exit1(mycp, 0);
911 				}
912 			}
913 			splx(s);
914 		}
915 	}
916 }
917 
918 /*
919  * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
920  * AIO daemon modifies its environment itself.
921  */
922 static int
923 aio_newproc()
924 {
925 	int error;
926 	struct proc *p, *np;
927 
928 	p = &proc0;
929 	error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
930 	if (error)
931 		return error;
932 	cpu_set_fork_handler(np, aio_daemon, curproc);
933 
934 	/*
935 	 * Wait until the daemon has started, but continue anyway so that
936 	 * error conditions can still be handled.
937 	 */
938 	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
939 	num_aio_procs++;
940 
941 	return error;
942 }
943 
944 /*
945  * Try the high-performance physio method for eligible VCHR devices.  This
946  * routine doesn't require the use of any additional threads, and has very low overhead.
947  */
948 int
949 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
950 {
951 	int error;
952 	struct aiocb *cb;
953 	struct file *fp;
954 	struct buf *bp;
955 	struct vnode *vp;
956 	struct kaioinfo *ki;
957 	struct filedesc *fdp;
958 	struct aio_liojob *lj;
959 	int fd;
960 	int s;
961 	int notify;
962 
963 	cb = &aiocbe->uaiocb;
964 	fdp = p->p_fd;
965 	fd = cb->aio_fildes;
966 	fp = fdp->fd_ofiles[fd];
967 
968 	if (fp->f_type != DTYPE_VNODE)
969 		return (-1);
970 
971 	vp = (struct vnode *)fp->f_data;
972 
973 	/*
974 	 * If it's not a disk, we don't want to return a positive error.
975 	 * That would keep the aio code from falling through to the threaded
976 	 * method when the target is a regular file.
977 	 */
978 	if (!vn_isdisk(vp, &error)) {
979 		if (error == ENOTBLK)
980 			return (-1);
981 		else
982 			return (error);
983 	}
984 
985  	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
986 		return (-1);
987 
988 	if (cb->aio_nbytes > MAXPHYS)
989 		return (-1);
990 
991 	ki = p->p_aioinfo;
992 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
993 		return (-1);
994 
995 	fhold(fp);
996 
997 	ki->kaio_buffer_count++;
998 
999 	lj = aiocbe->lio;
1000 	if (lj)
1001 		lj->lioj_buffer_count++;
1002 
1003 	/* Create and build a buffer header for a transfer. */
1004 	bp = (struct buf *)getpbuf(NULL);
1005 
1006 	/*
1007 	 * Get a copy of the kva from the physical buffer.
1008 	 */
1009 	bp->b_caller1 = p;
1010 	bp->b_dev = vp->v_rdev;
1011 	error = bp->b_error = 0;
1012 
1013 	bp->b_bcount = cb->aio_nbytes;
1014 	bp->b_bufsize = cb->aio_nbytes;
1015 	bp->b_flags = B_PHYS;
1016 	bp->b_iodone = aio_physwakeup;
1017 	bp->b_saveaddr = bp->b_data;
1018 	bp->b_data = (void *)cb->aio_buf;
1019 	bp->b_blkno = btodb(cb->aio_offset);
1020 
1021 	if (cb->aio_lio_opcode == LIO_WRITE) {
1022 		bp->b_iocmd = BIO_WRITE;
1023 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1024 			error = EFAULT;
1025 			goto doerror;
1026 		}
1027 	} else {
1028 		bp->b_iocmd = BIO_READ;
1029 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1030 			error = EFAULT;
1031 			goto doerror;
1032 		}
1033 	}
1034 
1035 	/* Bring buffer into kernel space. */
1036 	vmapbuf(bp);
1037 
1038 	s = splbio();
1039 	aiocbe->bp = bp;
1040 	bp->b_spc = (void *)aiocbe;
1041 	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1042 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1043 	aiocbe->jobstate = JOBST_JOBQBUF;
1044 	cb->_aiocb_private.status = cb->aio_nbytes;
1045 	num_buf_aio++;
1046 	bp->b_error = 0;
1047 
1048 	splx(s);
1049 
1050 	/* Perform transfer. */
1051 	DEV_STRATEGY(bp, 0);
1052 
1053 	notify = 0;
1054 	s = splbio();
1055 
1056 	/*
1057 	 * If we had an error invoking the request, or an error in processing
1058 	 * the request before we have returned, we process it as an error in
1059 	 * transfer.  Note that such an I/O error is not indicated immediately,
1060 	 * but is returned using the aio_error mechanism.  In this case,
1061 	 * aio_suspend will return immediately.
1062 	 */
1063 	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1064 		struct aiocb *job = aiocbe->uuaiocb;
1065 
1066 		aiocbe->uaiocb._aiocb_private.status = 0;
1067 		suword(&job->_aiocb_private.status, 0);
1068 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1069 		suword(&job->_aiocb_private.error, bp->b_error);
1070 
1071 		ki->kaio_buffer_finished_count++;
1072 
1073 		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1074 			aiocbe->jobstate = JOBST_JOBBFINISHED;
1075 			aiocbe->jobflags |= AIOCBLIST_DONE;
1076 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1077 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1078 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1079 			notify = 1;
1080 		}
1081 	}
1082 	splx(s);
1083 	if (notify)
1084 		KNOTE(&aiocbe->klist, 0);
1085 	fdrop(fp, p);
1086 	return 0;
1087 
1088 doerror:
1089 	ki->kaio_buffer_count--;
1090 	if (lj)
1091 		lj->lioj_buffer_count--;
1092 	aiocbe->bp = NULL;
1093 	relpbuf(bp, NULL);
1094 	fdrop(fp, p);
1095 	return error;
1096 }
1097 
1098 /*
1099  * This routine waits for, or tests for, physio completion.
1100  */
1101 int
1102 aio_fphysio(struct proc *p, struct aiocblist *iocb, int flgwait)
1103 {
1104 	int s;
1105 	struct buf *bp;
1106 	int error;
1107 
1108 	bp = iocb->bp;
1109 
1110 	s = splbio();
1111 	if (flgwait == 0) {
1112 		if ((bp->b_flags & B_DONE) == 0) {
1113 			splx(s);
1114 			return EINPROGRESS;
1115 		}
1116 	}
1117 
1118 	while ((bp->b_flags & B_DONE) == 0) {
1119 		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
1120 			if ((bp->b_flags & B_DONE) == 0) {
1121 				splx(s);
1122 				return EINPROGRESS;
1123 			} else
1124 				break;
1125 		}
1126 	}
1127 
1128 	/* Release mapping into kernel space. */
1129 	vunmapbuf(bp);
1130 	iocb->bp = 0;
1131 
1132 	error = 0;
1133 
1134 	/* Check for an error. */
1135 	if (bp->b_ioflags & BIO_ERROR)
1136 		error = bp->b_error;
1137 
1138 	relpbuf(bp, NULL);
1139 	return (error);
1140 }
1141 #endif /* VFS_AIO */
1142 
1143 /*
1144  * Wake up aio requests that may be serviceable now.
1145  */
1146 void
1147 aio_swake(struct socket *so, struct sockbuf *sb)
1148 {
1149 #ifndef VFS_AIO
1150 	return;
1151 #else
1152 	struct aiocblist *cb, *cbn;
1153 	struct proc *p;
1154 	struct kaioinfo *ki = NULL;
1155 	int opcode, wakecount = 0;
1156 	struct aioproclist *aiop;
1157 
1158 	if (sb == &so->so_snd) {
1159 		opcode = LIO_WRITE;
1160 		so->so_snd.sb_flags &= ~SB_AIO;
1161 	} else {
1162 		opcode = LIO_READ;
1163 		so->so_rcv.sb_flags &= ~SB_AIO;
1164 	}
1165 
1166 	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1167 		cbn = TAILQ_NEXT(cb, list);
1168 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1169 			p = cb->userproc;
1170 			ki = p->p_aioinfo;
1171 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1172 			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1173 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1174 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1175 			wakecount++;
1176 			if (cb->jobstate != JOBST_JOBQGLOBAL)
1177 				panic("invalid queue value");
1178 		}
1179 	}
1180 
1181 	while (wakecount--) {
1182 		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1183 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1184 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1185 			aiop->aioprocflags &= ~AIOP_FREE;
1186 			wakeup(aiop->aioproc);
1187 		}
1188 	}
1189 #endif /* VFS_AIO */
1190 }
1191 
1192 #ifdef VFS_AIO
1193 /*
1194  * Queue a new AIO request.  The choice between the threaded and the direct
1195  * physio (VCHR) technique is made in this code.
1196  */
1197 static int
1198 _aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
1199 {
1200 	struct filedesc *fdp;
1201 	struct file *fp;
1202 	unsigned int fd;
1203 	struct socket *so;
1204 	int s;
1205 	int error;
1206 	int opcode;
1207 	struct aiocblist *aiocbe;
1208 	struct aioproclist *aiop;
1209 	struct kaioinfo *ki;
1210 	struct kevent kev;
1211 	struct kqueue *kq;
1212 	struct file *kq_fp;
1213 
1214 	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
1215 		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
1216 	else
1217 		aiocbe = zalloc (aiocb_zone);
1218 
1219 	aiocbe->inputcharge = 0;
1220 	aiocbe->outputcharge = 0;
1221 	SLIST_INIT(&aiocbe->klist);
1222 
1223 	suword(&job->_aiocb_private.status, -1);
1224 	suword(&job->_aiocb_private.error, 0);
1225 	suword(&job->_aiocb_private.kernelinfo, -1);
1226 
1227 	error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof
1228 	    aiocbe->uaiocb);
1229 	if (error) {
1230 		suword(&job->_aiocb_private.error, error);
1231 
1232 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1233 		return error;
1234 	}
1235 
1236 	/* Save userspace address of the job info. */
1237 	aiocbe->uuaiocb = job;
1238 
1239 	/* Get the opcode. */
1240 	if (type != LIO_NOP)
1241 		aiocbe->uaiocb.aio_lio_opcode = type;
1242 	opcode = aiocbe->uaiocb.aio_lio_opcode;
1243 
1244 	/* Get the fd info for process. */
1245 	fdp = p->p_fd;
1246 
1247 	/*
1248 	 * Range check file descriptor.
1249 	 */
1250 	fd = aiocbe->uaiocb.aio_fildes;
1251 	if (fd >= fdp->fd_nfiles) {
1252 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1253 		if (type == 0)
1254 			suword(&job->_aiocb_private.error, EBADF);
1255 		return EBADF;
1256 	}
1257 
1258 	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1259 	if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1260 	    0))) {
1261 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1262 		if (type == 0)
1263 			suword(&job->_aiocb_private.error, EBADF);
1264 		return EBADF;
1265 	}
1266 
1267 	if (aiocbe->uaiocb.aio_offset == -1LL) {
1268 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1269 		if (type == 0)
1270 			suword(&job->_aiocb_private.error, EINVAL);
1271 		return EINVAL;
1272 	}
1273 
1274 	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1275 	if (error) {
1276 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1277 		if (type == 0)
1278 			suword(&job->_aiocb_private.error, EINVAL);
1279 		return error;
1280 	}
1281 
1282 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1283 	if (jobrefid == LONG_MAX)
1284 		jobrefid = 1;
1285 	else
1286 		jobrefid++;
1287 
1288 	if (opcode == LIO_NOP) {
1289 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1290 		if (type == 0) {
1291 			suword(&job->_aiocb_private.error, 0);
1292 			suword(&job->_aiocb_private.status, 0);
1293 			suword(&job->_aiocb_private.kernelinfo, 0);
1294 		}
1295 		return 0;
1296 	}
1297 
1298 	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1299 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1300 		if (type == 0) {
1301 			suword(&job->_aiocb_private.status, 0);
1302 			suword(&job->_aiocb_private.error, EINVAL);
1303 		}
1304 		return EINVAL;
1305 	}
1306 
1307 	fhold(fp);
1308 
1309 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1310 		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1311 		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1312 	}
1313 	else {
1314 		/*
1315 		 * This method for requesting kevent-based notification won't
1316 		 * work on the alpha, since we're passing in a pointer
1317 		 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
1318 		 * based method instead.
1319 		 */
1320 		struct kevent *kevp;
1321 
1322 		kevp = (struct kevent *)job->aio_lio_opcode;
1323 		if (kevp == NULL)
1324 			goto no_kqueue;
1325 
1326 		error = copyin((caddr_t)kevp, (caddr_t)&kev, sizeof(kev));
1327 		if (error)
1328 			goto aqueue_fail;
1329 	}
1330 	if ((u_int)kev.ident >= fdp->fd_nfiles ||
1331 	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1332 	    (kq_fp->f_type != DTYPE_KQUEUE)) {
1333 		error = EBADF;
1334 		goto aqueue_fail;
1335 	}
1336 	kq = (struct kqueue *)kq_fp->f_data;
1337 	kev.ident = (uintptr_t)aiocbe;
1338 	kev.filter = EVFILT_AIO;
1339 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1340 	error = kqueue_register(kq, &kev, p);
1341 aqueue_fail:
1342 	if (error) {
1343 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1344 		if (type == 0)
1345 			suword(&job->_aiocb_private.error, error);
1346 		goto done;
1347 	}
1348 no_kqueue:
1349 
1350 	suword(&job->_aiocb_private.error, EINPROGRESS);
1351 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1352 	aiocbe->userproc = p;
1353 	aiocbe->jobflags = 0;
1354 	aiocbe->lio = lj;
1355 	ki = p->p_aioinfo;
1356 
1357 	if (fp->f_type == DTYPE_SOCKET) {
1358 		/*
1359 		 * Alternate queueing for socket ops: Reach down into the
1360 		 * descriptor to get the socket data.  Then check to see if the
1361 		 * socket is ready to be read or written (based on the requested
1362 		 * operation).
1363 		 *
1364 		 * If it is not ready for io, then queue the aiocbe on the
1365 		 * If it is not ready for I/O, then queue the aiocbe on the
1366 		 * happens.
1367 		 */
1368 		so = (struct socket *)fp->f_data;
1369 		s = splnet();
1370 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1371 		    LIO_WRITE) && (!sowriteable(so)))) {
1372 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1373 			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1374 			if (opcode == LIO_READ)
1375 				so->so_rcv.sb_flags |= SB_AIO;
1376 			else
1377 				so->so_snd.sb_flags |= SB_AIO;
1378 			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1379 			ki->kaio_queue_count++;
1380 			num_queue_count++;
1381 			splx(s);
1382 			error = 0;
1383 			goto done;
1384 		}
1385 		splx(s);
1386 	}
1387 
1388 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1389 		goto done;
1390 	if (error > 0) {
1391 		suword(&job->_aiocb_private.status, 0);
1392 		aiocbe->uaiocb._aiocb_private.error = error;
1393 		suword(&job->_aiocb_private.error, error);
1394 		goto done;
1395 	}
1396 
1397 	/* No buffer for daemon I/O. */
1398 	aiocbe->bp = NULL;
1399 
1400 	ki->kaio_queue_count++;
1401 	if (lj)
1402 		lj->lioj_queue_count++;
1403 	s = splnet();
1404 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1405 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1406 	splx(s);
1407 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1408 
1409 	num_queue_count++;
1410 	error = 0;
1411 
1412 	/*
1413 	 * If we don't have a free AIO process, and we are below our quota, then
1414 	 * start one.  Otherwise, depend on the subsequent I/O completions to
1415 	 * pick up this job.  If we don't successfully create the new process
1416 	 * (thread) due to resource issues, we return an error for now (EAGAIN),
1417 	 * which is likely not the correct thing to do.
1418 	 */
1419 retryproc:
1420 	s = splnet();
1421 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1422 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1423 		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1424 		aiop->aioprocflags &= ~AIOP_FREE;
1425 		wakeup(aiop->aioproc);
1426 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1427 	    ((ki->kaio_active_count + num_aio_resv_start) <
1428 	    ki->kaio_maxactive_count)) {
1429 		num_aio_resv_start++;
1430 		if ((error = aio_newproc()) == 0) {
1431 			num_aio_resv_start--;
1432 			p->p_retval[0] = 0;
1433 			goto retryproc;
1434 		}
1435 		num_aio_resv_start--;
1436 	}
1437 	splx(s);
1438 done:
1439 	fdrop(fp, p);
1440 	return error;
1441 }
1442 
1443 /*
1444  * This routine queues an AIO request, checking for quotas.
1445  */
1446 static int
1447 aio_aqueue(struct proc *p, struct aiocb *job, int type)
1448 {
1449 	struct kaioinfo *ki;
1450 
1451 	if (p->p_aioinfo == NULL)
1452 		aio_init_aioinfo(p);
1453 
1454 	if (num_queue_count >= max_queue_count)
1455 		return EAGAIN;
1456 
1457 	ki = p->p_aioinfo;
1458 	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1459 		return EAGAIN;
1460 
1461 	return _aio_aqueue(p, job, NULL, type);
1462 }
1463 #endif /* VFS_AIO */
1464 
1465 /*
1466  * Support the aio_return system call; as a side effect, kernel resources are
1467  * released.
1468  */
1469 int
1470 aio_return(struct proc *p, struct aio_return_args *uap)
1471 {
1472 #ifndef VFS_AIO
1473 	return ENOSYS;
1474 #else
1475 	int s;
1476 	int jobref;
1477 	struct aiocblist *cb, *ncb;
1478 	struct aiocb *ujob;
1479 	struct kaioinfo *ki;
1480 
1481 	ki = p->p_aioinfo;
1482 	if (ki == NULL)
1483 		return EINVAL;
1484 
1485 	ujob = uap->aiocbp;
1486 
1487 	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1488 	if (jobref == -1 || jobref == 0)
1489 		return EINVAL;
1490 
1491 	s = splnet();
1492 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1493 	    plist)) {
1494 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1495 		    jobref) {
1496 			splx(s);
1497 			if (ujob == cb->uuaiocb) {
1498 				p->p_retval[0] =
1499 				    cb->uaiocb._aiocb_private.status;
1500 			} else
1501 				p->p_retval[0] = EFAULT;
1502 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1503 				curproc->p_stats->p_ru.ru_oublock +=
1504 				    cb->outputcharge;
1505 				cb->outputcharge = 0;
1506 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1507 				curproc->p_stats->p_ru.ru_inblock +=
1508 				    cb->inputcharge;
1509 				cb->inputcharge = 0;
1510 			}
1511 			aio_free_entry(cb);
1512 			return 0;
1513 		}
1514 	}
1515 	splx(s);
1516 
1517 	s = splbio();
1518 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1519 		ncb = TAILQ_NEXT(cb, plist);
1520 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1521 		    == jobref) {
1522 			splx(s);
1523 			if (ujob == cb->uuaiocb) {
1524 				p->p_retval[0] =
1525 				    cb->uaiocb._aiocb_private.status;
1526 			} else
1527 				p->p_retval[0] = EFAULT;
1528 			aio_free_entry(cb);
1529 			return 0;
1530 		}
1531 	}
1532 	splx(s);
1533 
1534 	return (EINVAL);
1535 #endif /* VFS_AIO */
1536 }
1537 
1538 /*
1539  * Allow a process to wake up when any of the I/O requests are completed.
1540  */
1541 int
1542 aio_suspend(struct proc *p, struct aio_suspend_args *uap)
1543 {
1544 #ifndef VFS_AIO
1545 	return ENOSYS;
1546 #else
1547 	struct timeval atv;
1548 	struct timespec ts;
1549 	struct aiocb *const *cbptr, *cbp;
1550 	struct kaioinfo *ki;
1551 	struct aiocblist *cb;
1552 	int i;
1553 	int njoblist;
1554 	int error, s, timo;
1555 	int *ijoblist;
1556 	struct aiocb **ujoblist;
1557 
1558 	if (uap->nent >= AIO_LISTIO_MAX)
1559 		return EINVAL;
1560 
1561 	timo = 0;
1562 	if (uap->timeout) {
1563 		/* Get timespec struct. */
1564 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1565 			return error;
1566 
1567 		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1568 			return (EINVAL);
1569 
1570 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
1571 		if (itimerfix(&atv))
1572 			return (EINVAL);
1573 		timo = tvtohz(&atv);
1574 	}
1575 
1576 	ki = p->p_aioinfo;
1577 	if (ki == NULL)
1578 		return EAGAIN;
1579 
1580 	njoblist = 0;
1581 	ijoblist = zalloc(aiol_zone);
1582 	ujoblist = zalloc(aiol_zone);
1583 	cbptr = uap->aiocbp;
1584 
1585 	for (i = 0; i < uap->nent; i++) {
1586 		cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1587 		if (cbp == 0)
1588 			continue;
1589 		ujoblist[njoblist] = cbp;
1590 		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1591 		njoblist++;
1592 	}
1593 
1594 	if (njoblist == 0) {
1595 		zfree(aiol_zone, ijoblist);
1596 		zfree(aiol_zone, ujoblist);
1597 		return 0;
1598 	}
1599 
1600 	error = 0;
1601 	for (;;) {
1602 		for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb =
1603 		    TAILQ_NEXT(cb, plist)) {
1604 			for (i = 0; i < njoblist; i++) {
1605 				if (((intptr_t)
1606 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1607 				    ijoblist[i]) {
1608 					if (ujoblist[i] != cb->uuaiocb)
1609 						error = EINVAL;
1610 					zfree(aiol_zone, ijoblist);
1611 					zfree(aiol_zone, ujoblist);
1612 					return error;
1613 				}
1614 			}
1615 		}
1616 
1617 		s = splbio();
1618 		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1619 		    TAILQ_NEXT(cb, plist)) {
1620 			for (i = 0; i < njoblist; i++) {
1621 				if (((intptr_t)
1622 				    cb->uaiocb._aiocb_private.kernelinfo) ==
1623 				    ijoblist[i]) {
1624 					splx(s);
1625 					if (ujoblist[i] != cb->uuaiocb)
1626 						error = EINVAL;
1627 					zfree(aiol_zone, ijoblist);
1628 					zfree(aiol_zone, ujoblist);
1629 					return error;
1630 				}
1631 			}
1632 		}
1633 
1634 		ki->kaio_flags |= KAIO_WAKEUP;
1635 		error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1636 		splx(s);
1637 
1638 		if (error == ERESTART || error == EINTR) {
1639 			zfree(aiol_zone, ijoblist);
1640 			zfree(aiol_zone, ujoblist);
1641 			return EINTR;
1642 		} else if (error == EWOULDBLOCK) {
1643 			zfree(aiol_zone, ijoblist);
1644 			zfree(aiol_zone, ujoblist);
1645 			return EAGAIN;
1646 		}
1647 	}
1648 
1649 /* NOTREACHED */
1650 	return EINVAL;
1651 #endif /* VFS_AIO */
1652 }
1653 
1654 /*
1655  * aio_cancel cancels any non-physio aio operations not currently in
1656  * progress.
1657  */
1658 int
1659 aio_cancel(struct proc *p, struct aio_cancel_args *uap)
1660 {
1661 #ifndef VFS_AIO
1662 	return ENOSYS;
1663 #else
1664 	struct kaioinfo *ki;
1665 	struct aiocblist *cbe, *cbn;
1666 	struct file *fp;
1667 	struct filedesc *fdp;
1668 	struct socket *so;
1669 	struct proc *po;
1670 	int s, error;
1671 	int cancelled=0;
1672 	int notcancelled=0;
1673 	struct vnode *vp;
1674 
1675 	fdp = p->p_fd;
1676 
1677 	fp = fdp->fd_ofiles[uap->fd];
1678 
1679 	if (fp == NULL) {
1680 		return EBADF;
1681 	}
1682 
1683 	if (fp->f_type == DTYPE_VNODE) {
1684 		vp = (struct vnode *)fp->f_data;
1685 
1686 		if (vn_isdisk(vp, &error)) {
1687 			p->p_retval[0] = AIO_NOTCANCELED;
1688 			return 0;
1689 		}
1690 	} else if (fp->f_type == DTYPE_SOCKET) {
1691 		so = (struct socket *)fp->f_data;
1692 
1693 		s = splnet();
1694 
1695 		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1696 			cbn = TAILQ_NEXT(cbe, list);
1697 			if ((uap->aiocbp == NULL) ||
1698 				(uap->aiocbp == cbe->uuaiocb) ) {
1699 				po = cbe->userproc;
1700 				ki = po->p_aioinfo;
1701 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1702 				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1703 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1704 				if (ki->kaio_flags & KAIO_WAKEUP) {
1705 					wakeup(po);
1706 				}
1707 				cbe->jobstate = JOBST_JOBFINISHED;
1708 				cbe->uaiocb._aiocb_private.status = -1;
1709 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1710 				cancelled++;
1711 /* XXX cancelled, knote? */
1712 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1713 				    SIGEV_SIGNAL)
1714 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1715 				if (uap->aiocbp)
1716 					break;
1717 			}
1718 		}
1719 
1720 		splx(s);
1721 
1722 		if ((cancelled) && (uap->aiocbp)) {
1723 			p->p_retval[0] = AIO_CANCELED;
1724 			return 0;
1725 		}
1726 
1727 	}
1728 
1729 	ki = p->p_aioinfo;
1730 
1731 	s = splnet();
1732 
1733 	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1734 		cbn = TAILQ_NEXT(cbe, plist);
1735 
1736 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1737 		    ((uap->aiocbp == NULL ) ||
1738 		     (uap->aiocbp == cbe->uuaiocb))) {
1739 
1740 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1741 				TAILQ_REMOVE(&aio_jobs, cbe, list);
1742 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1743 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1744 				    plist);
1745 				cancelled++;
1746 				ki->kaio_queue_finished_count++;
1747 				cbe->jobstate = JOBST_JOBFINISHED;
1748 				cbe->uaiocb._aiocb_private.status = -1;
1749 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1750 /* XXX cancelled, knote? */
1751 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1752 				    SIGEV_SIGNAL)
1753 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1754 			} else {
1755 				notcancelled++;
1756 			}
1757 		}
1758 	}
1759 
1760 	splx(s);
1761 
1762 
1763 	if (notcancelled) {
1764 		p->p_retval[0] = AIO_NOTCANCELED;
1765 		return 0;
1766 	}
1767 
1768 	if (cancelled) {
1769 		p->p_retval[0] = AIO_CANCELED;
1770 		return 0;
1771 	}
1772 
1773 	p->p_retval[0] = AIO_ALLDONE;
1774 
1775 	return 0;
1776 #endif /* VFS_AIO */
1777 }
1778 
1779 /*
1780  * aio_error is implemented at the kernel level for compatibility purposes only.
1781  * For a user mode async implementation, it would be best to do it in a userland
1782  * subroutine.
1783  */
1784 int
1785 aio_error(struct proc *p, struct aio_error_args *uap)
1786 {
1787 #ifndef VFS_AIO
1788 	return ENOSYS;
1789 #else
1790 	int s;
1791 	struct aiocblist *cb;
1792 	struct kaioinfo *ki;
1793 	int jobref;
1794 
1795 	ki = p->p_aioinfo;
1796 	if (ki == NULL)
1797 		return EINVAL;
1798 
1799 	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1800 	if ((jobref == -1) || (jobref == 0))
1801 		return EINVAL;
1802 
1803 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1804 	    plist)) {
1805 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1806 		    jobref) {
1807 			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1808 			return 0;
1809 		}
1810 	}
1811 
1812 	s = splnet();
1813 
1814 	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1815 	    plist)) {
1816 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1817 		    jobref) {
1818 			p->p_retval[0] = EINPROGRESS;
1819 			splx(s);
1820 			return 0;
1821 		}
1822 	}
1823 
1824 	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1825 	    plist)) {
1826 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1827 		    jobref) {
1828 			p->p_retval[0] = EINPROGRESS;
1829 			splx(s);
1830 			return 0;
1831 		}
1832 	}
1833 	splx(s);
1834 
1835 	s = splbio();
1836 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1837 	    plist)) {
1838 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1839 		    jobref) {
1840 			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1841 			splx(s);
1842 			return 0;
1843 		}
1844 	}
1845 
1846 	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1847 	    plist)) {
1848 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1849 		    jobref) {
1850 			p->p_retval[0] = EINPROGRESS;
1851 			splx(s);
1852 			return 0;
1853 		}
1854 	}
1855 	splx(s);
1856 
1857 #if (0)
1858 	/*
1859 	 * Hack for lio.
1860 	 */
1861 	status = fuword(&uap->aiocbp->_aiocb_private.status);
1862 	if (status == -1)
1863 		return fuword(&uap->aiocbp->_aiocb_private.error);
1864 #endif
1865 	return EINVAL;
1866 #endif /* VFS_AIO */
1867 }
1868 
1869 int
1870 aio_read(struct proc *p, struct aio_read_args *uap)
1871 {
1872 #ifndef VFS_AIO
1873 	return ENOSYS;
1874 #else
1875 	struct filedesc *fdp;
1876 	struct file *fp;
1877 	struct uio auio;
1878 	struct iovec aiov;
1879 	unsigned int fd;
1880 	int cnt;
1881 	struct aiocb iocb;
1882 	int error, pmodes;
1883 
1884 	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1885 	if ((pmodes & AIO_PMODE_SYNC) == 0)
1886 		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
1887 
1888 	/* Get control block. */
1889 	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
1890 	    != 0)
1891 		return error;
1892 
1893 	/* Get the fd info for process. */
1894 	fdp = p->p_fd;
1895 
1896 	/*
1897 	 * Range check file descriptor.
1898 	 */
1899 	fd = iocb.aio_fildes;
1900 	if (fd >= fdp->fd_nfiles)
1901 		return EBADF;
1902 	fp = fdp->fd_ofiles[fd];
1903 	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
1904 		return EBADF;
1905 	if (iocb.aio_offset == -1LL)
1906 		return EINVAL;
1907 
1908 	auio.uio_resid = iocb.aio_nbytes;
1909 	if (auio.uio_resid < 0)
1910 		return (EINVAL);
1911 
1912 	/*
1913 	 * Process sync simply -- queue async request.
1914 	 */
1915 	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0)
1916 		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
1917 
1918 	aiov.iov_base = (void *)iocb.aio_buf;
1919 	aiov.iov_len = iocb.aio_nbytes;
1920 
1921 	auio.uio_iov = &aiov;
1922 	auio.uio_iovcnt = 1;
1923 	auio.uio_offset = iocb.aio_offset;
1924 	auio.uio_rw = UIO_READ;
1925 	auio.uio_segflg = UIO_USERSPACE;
1926 	auio.uio_procp = p;
1927 
1928 	cnt = iocb.aio_nbytes;
1929 	/*
1930 	 * Temporarily bump the ref count while reading to avoid the
1931 	 * descriptor being ripped out from under us.
1932 	 */
1933 	fhold(fp);
1934 	error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p);
1935 	fdrop(fp, p);
1936 	if (error && (auio.uio_resid != cnt) && (error == ERESTART || error ==
1937 	    EINTR || error == EWOULDBLOCK))
1938 		error = 0;
1939 	cnt -= auio.uio_resid;
1940 	p->p_retval[0] = cnt;
1941 	return error;
1942 #endif /* VFS_AIO */
1943 }
1944 
1945 int
1946 aio_write(struct proc *p, struct aio_write_args *uap)
1947 {
1948 #ifndef VFS_AIO
1949 	return ENOSYS;
1950 #else
1951 	struct filedesc *fdp;
1952 	struct file *fp;
1953 	struct uio auio;
1954 	struct iovec aiov;
1955 	unsigned int fd;
1956 	int cnt;
1957 	struct aiocb iocb;
1958 	int error;
1959 	int pmodes;
1960 
1961 	/*
1962 	 * Process sync simply -- queue async request.
1963 	 */
1964 	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1965 	if ((pmodes & AIO_PMODE_SYNC) == 0)
1966 		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE);
1967 
1968 	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
1969 	    != 0)
1970 		return error;
1971 
1972 	/* Get the fd info for process. */
1973 	fdp = p->p_fd;
1974 
1975 	/*
1976 	 * Range check file descriptor.
1977 	 */
1978 	fd = iocb.aio_fildes;
1979 	if (fd >= fdp->fd_nfiles)
1980 		return EBADF;
1981 	fp = fdp->fd_ofiles[fd];
1982 	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
1983 		return EBADF;
1984 	if (iocb.aio_offset == -1LL)
1985 		return EINVAL;
1986 
1987 	aiov.iov_base = (void *)iocb.aio_buf;
1988 	aiov.iov_len = iocb.aio_nbytes;
1989 	auio.uio_iov = &aiov;
1990 	auio.uio_iovcnt = 1;
1991 	auio.uio_offset = iocb.aio_offset;
1992 
1993 	auio.uio_resid = iocb.aio_nbytes;
1994 	if (auio.uio_resid < 0)
1995 		return (EINVAL);
1996 
1997 	auio.uio_rw = UIO_WRITE;
1998 	auio.uio_segflg = UIO_USERSPACE;
1999 	auio.uio_procp = p;
2000 
2001 	cnt = iocb.aio_nbytes;
2002 	/*
2003 	 * Temporarily bump the ref count while writing to avoid the
2004 	 * descriptor being ripped out from under us.
2005 	 */
2006 	fhold(fp);
2007 	error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p);
2008 	fdrop(fp, p);
2009 	if (error) {
2010 		if (auio.uio_resid != cnt) {
2011 			if (error == ERESTART || error == EINTR || error ==
2012 			    EWOULDBLOCK)
2013 				error = 0;
2014 			if (error == EPIPE)
2015 				psignal(p, SIGPIPE);
2016 		}
2017 	}
2018 	cnt -= auio.uio_resid;
2019 	p->p_retval[0] = cnt;
2020 	return error;
2021 #endif /* VFS_AIO */
2022 }
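
/*
 * Completion notification from the submitter's side, matching the
 * SIGEV_SIGNAL handling in process_signal() below: an illustrative sketch
 * (not part of this file).  `fd', `buf', `buflen' and a SIGUSR1 handler
 * installed with sigaction() are assumed; error handling is trimmed.
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	struct aiocb cb;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = buflen;
 *	cb.aio_offset = 0;
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;
 *	(void)aio_write(&cb);
 *
 * When the request completes, the kernel posts SIGUSR1 with psignal(); the
 * handler then rechecks its outstanding aiocbs with aio_error() and
 * retires the finished one with aio_return().
 */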
2023 
2024 int
2025 lio_listio(struct proc *p, struct lio_listio_args *uap)
2026 {
2027 #ifndef VFS_AIO
2028 	return ENOSYS;
2029 #else
2030 	int nent, nentqueued;
2031 	struct aiocb *iocb, * const *cbptr;
2032 	struct aiocblist *cb;
2033 	struct kaioinfo *ki;
2034 	struct aio_liojob *lj;
2035 	int error, runningcode;
2036 	int nerror;
2037 	int i;
2038 	int s;
2039 
2040 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2041 		return EINVAL;
2042 
2043 	nent = uap->nent;
2044 	if (nent > AIO_LISTIO_MAX)
2045 		return EINVAL;
2046 
2047 	if (p->p_aioinfo == NULL)
2048 		aio_init_aioinfo(p);
2049 
2050 	if ((nent + num_queue_count) > max_queue_count)
2051 		return EAGAIN;
2052 
2053 	ki = p->p_aioinfo;
2054 	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
2055 		return EAGAIN;
2056 
2057 	lj = zalloc(aiolio_zone);
2058 	if (!lj)
2059 		return EAGAIN;
2060 
2061 	lj->lioj_flags = 0;
2062 	lj->lioj_buffer_count = 0;
2063 	lj->lioj_buffer_finished_count = 0;
2064 	lj->lioj_queue_count = 0;
2065 	lj->lioj_queue_finished_count = 0;
2066 	lj->lioj_ki = ki;
2067 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2068 
2069 	/*
2070 	 * Set up the completion signal, if one was requested.
2071 	 */
2072 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2073 		error = copyin(uap->sig, &lj->lioj_signal,
2074 		    sizeof(lj->lioj_signal));
2075 		if (error)
2076 			return error;
2077 		lj->lioj_flags |= LIOJ_SIGNAL;
2078 		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
2079 	} else
2080 		lj->lioj_flags &= ~LIOJ_SIGNAL;
2081 
2082 	/*
2083 	 * Get pointers to the list of I/O requests.
2084 	 */
2085 	nerror = 0;
2086 	nentqueued = 0;
2087 	cbptr = uap->acb_list;
2088 	for (i = 0; i < uap->nent; i++) {
2089 		iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2090 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
2091 			error = _aio_aqueue(p, iocb, lj, 0);
2092 			if (error == 0)
2093 				nentqueued++;
2094 			else
2095 				nerror++;
2096 		}
2097 	}
2098 
2099 	/*
2100 	 * If none were queued, there is nothing to wait for; just return.
2101 	 */
2102 	if (nentqueued == 0)
2103 		return 0;
2104 
2105 	/*
2106 	 * Calculate the appropriate error return.
2107 	 */
2108 	runningcode = 0;
2109 	if (nerror)
2110 		runningcode = EIO;
2111 
2112 	if (uap->mode == LIO_WAIT) {
2113 		int command, found, jobref;
2114 
2115 		for (;;) {
2116 			found = 0;
2117 			for (i = 0; i < uap->nent; i++) {
2118 				/*
2119 				 * Fetch the aiocb pointer from the list in
2120 				 * user space.
2121 				 */
2122 				iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2123 				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2124 				    == 0))
2125 					continue;
2126 
2127 				/*
2128 				 * Fetch the associated command from user space.
2129 				 */
2130 				command = fuword(&iocb->aio_lio_opcode);
2131 				if (command == LIO_NOP) {
2132 					found++;
2133 					continue;
2134 				}
2135 
2136 				jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2137 
2138 				for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
2139 				    cb = TAILQ_NEXT(cb, plist)) {
2140 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2141 					    == jobref) {
2142 						if (cb->uaiocb.aio_lio_opcode
2143 						    == LIO_WRITE) {
2144 							curproc->p_stats->p_ru.ru_oublock
2145 							    +=
2146 							    cb->outputcharge;
2147 							cb->outputcharge = 0;
2148 						} else if (cb->uaiocb.aio_lio_opcode
2149 						    == LIO_READ) {
2150 							curproc->p_stats->p_ru.ru_inblock
2151 							    += cb->inputcharge;
2152 							cb->inputcharge = 0;
2153 						}
2154 						found++;
2155 						break;
2156 					}
2157 				}
2158 
2159 				s = splbio();
2160 				for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb;
2161 				    cb = TAILQ_NEXT(cb, plist)) {
2162 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2163 					    == jobref) {
2164 						found++;
2165 						break;
2166 					}
2167 				}
2168 				splx(s);
2169 			}
2170 
2171 			/*
2172 			 * If all I/Os have been disposed of, then we can
2173 			 * return.
2174 			 */
2175 			if (found == nentqueued)
2176 				return runningcode;
2177 
2178 			ki->kaio_flags |= KAIO_WAKEUP;
2179 			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2180 
2181 			if (error == EINTR)
2182 				return EINTR;
2183 			else if (error == EWOULDBLOCK)
2184 				return EAGAIN;
2185 		}
2186 	}
2187 
2188 	return runningcode;
2189 #endif /* VFS_AIO */
2190 }
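
/*
 * A batched submission as lio_listio() above expects to see it: an
 * illustrative sketch (not part of this file) issuing one read and one
 * write and waiting for both with LIO_WAIT.  `fd', `inbuf' and `outbuf'
 * are assumed; error handling is trimmed.
 *
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	struct aiocb rd, wr;
 *	struct aiocb *list[2];
 *
 *	memset(&rd, 0, sizeof(rd));
 *	rd.aio_fildes = fd;
 *	rd.aio_buf = inbuf;
 *	rd.aio_nbytes = sizeof(inbuf);
 *	rd.aio_offset = 0;
 *	rd.aio_lio_opcode = LIO_READ;
 *
 *	memset(&wr, 0, sizeof(wr));
 *	wr.aio_fildes = fd;
 *	wr.aio_buf = outbuf;
 *	wr.aio_nbytes = sizeof(outbuf);
 *	wr.aio_offset = 512;
 *	wr.aio_lio_opcode = LIO_WRITE;
 *
 *	list[0] = &rd;
 *	list[1] = &wr;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
 *		... at least one request failed; check each aiocb with aio_error() ...
 */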
2191 
2192 #ifdef VFS_AIO
2193 /*
2194  * This is a weird hack so that we can post a signal.  It is safe to do so from
2195  * a timeout routine, but *not* from an interrupt routine.
2196  */
2197 static void
2198 process_signal(void *aioj)
2199 {
2200 	struct aiocblist *aiocbe = aioj;
2201 	struct aio_liojob *lj = aiocbe->lio;
2202 	struct aiocb *cb = &aiocbe->uaiocb;
2203 
2204 	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2205 	    (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2206 		psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2207 		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2208 	}
2209 
2210 	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2211 		psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2212 }
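
/*
 * The deferral idiom used above (and again in aio_physwakeup() below),
 * shown in isolation: psignal() may not be called from an interrupt
 * handler, so the interrupt code schedules the delivery through
 * timeout(9) with a tick count of zero and lets softclock run it in a
 * safe context.  A minimal sketch with a hypothetical handler; not part
 * of this file.
 *
 *	static void
 *	deliver_signal(void *arg)
 *	{
 *		struct proc *p = arg;
 *
 *		psignal(p, SIGIO);		(safe here, unlike in the interrupt)
 *	}
 *
 *	...
 *	timeout(deliver_signal, p, 0);		(from the interrupt handler)
 */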
2213 
2214 /*
2215  * Interrupt handler for physio; performs the necessary process wakeups
2216  * and signals.
2217  */
2218 static void
2219 aio_physwakeup(struct buf *bp)
2220 {
2221 	struct aiocblist *aiocbe;
2222 	struct proc *p;
2223 	struct kaioinfo *ki;
2224 	struct aio_liojob *lj;
2225 
2226 	wakeup((caddr_t)bp);
2227 
2228 	aiocbe = (struct aiocblist *)bp->b_spc;
2229 	if (aiocbe) {
2230 		p = bp->b_caller1;
2231 
2232 		aiocbe->jobstate = JOBST_JOBBFINISHED;
2233 		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2234 		aiocbe->uaiocb._aiocb_private.error = 0;
2235 		aiocbe->jobflags |= AIOCBLIST_DONE;
2236 
2237 		if (bp->b_ioflags & BIO_ERROR)
2238 			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2239 
2240 		lj = aiocbe->lio;
2241 		if (lj) {
2242 			lj->lioj_buffer_finished_count++;
2243 
2244 			/*
2245 			 * wakeup/signal if all of the interrupt jobs are done.
2246 			 */
2247 			if (lj->lioj_buffer_finished_count ==
2248 			    lj->lioj_buffer_count) {
2249 				/*
2250 				 * Post a signal if it is called for.
2251 				 */
2252 				if ((lj->lioj_flags &
2253 				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2254 				    LIOJ_SIGNAL) {
2255 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2256 					timeout(process_signal, aiocbe, 0);
2257 				}
2258 			}
2259 		}
2260 
2261 		ki = p->p_aioinfo;
2262 		if (ki) {
2263 			ki->kaio_buffer_finished_count++;
2264 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2265 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2266 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2267 
2268 			KNOTE(&aiocbe->klist, 0);
2269 			/* Do the wakeup. */
2270 			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2271 				ki->kaio_flags &= ~KAIO_WAKEUP;
2272 				wakeup(p);
2273 			}
2274 		}
2275 
2276 		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2277 			timeout(process_signal, aiocbe, 0);
2278 	}
2279 }
2280 #endif /* VFS_AIO */
2281 
2282 int
2283 aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap)
2284 {
2285 #ifndef VFS_AIO
2286 	return ENOSYS;
2287 #else
2288 	struct timeval atv;
2289 	struct timespec ts;
2290 	struct aiocb **cbptr;
2291 	struct kaioinfo *ki;
2292 	struct aiocblist *cb = NULL;
2293 	int error, s, timo;
2294 
2295 	suword(uap->aiocbp, (long)NULL);
2296 
2297 	timo = 0;
2298 	if (uap->timeout) {
2299 		/* Get timespec struct. */
2300 		error = copyin((caddr_t)uap->timeout, (caddr_t)&ts,
2301 		    sizeof(ts));
2302 		if (error)
2303 			return error;
2304 
2305 		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2306 			return (EINVAL);
2307 
2308 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
2309 		if (itimerfix(&atv))
2310 			return (EINVAL);
2311 		timo = tvtohz(&atv);
2312 	}
2313 
2314 	ki = p->p_aioinfo;
2315 	if (ki == NULL)
2316 		return EAGAIN;
2317 
2318 	cbptr = uap->aiocbp;
2319 
2320 	for (;;) {
2321 		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2322 			suword(uap->aiocbp, (long)cb->uuaiocb);
2323 			p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2324 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2325 				curproc->p_stats->p_ru.ru_oublock +=
2326 				    cb->outputcharge;
2327 				cb->outputcharge = 0;
2328 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2329 				curproc->p_stats->p_ru.ru_inblock +=
2330 				    cb->inputcharge;
2331 				cb->inputcharge = 0;
2332 			}
2333 			error = cb->uaiocb._aiocb_private.error;
2334 			aio_free_entry(cb);
 			return error;
2335 		}
2336 
2337 		s = splbio();
2338 		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0) {
2339 			splx(s);
2340 			suword(uap->aiocbp, (long)cb->uuaiocb);
2341 			p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2342 			error = cb->uaiocb._aiocb_private.error;
2343 			aio_free_entry(cb);
 			return error;
2344 		}
2345 
2346 		ki->kaio_flags |= KAIO_WAKEUP;
2347 		error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2348 		splx(s);
2349 
2350 		if (error == ERESTART)
2351 			return EINTR;
2352 		else if (error < 0)
2353 			return error;
2354 		else if (error == EINTR)
2355 			return EINTR;
2356 		else if (error == EWOULDBLOCK)
2357 			return EAGAIN;
2358 	}
2359 #endif /* VFS_AIO */
2360 }
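
/*
 * Userland counterpart of the syscall above, per aio_waitcomplete(2):
 * wait for whichever outstanding request finishes next and retire it in
 * one step.  An illustrative sketch (not part of this file); the requests
 * are assumed to have been submitted elsewhere.
 *
 *	#include <aio.h>
 *
 *	struct aiocb *done;
 *	struct timespec ts = { 5, 0 };		(wait at most five seconds)
 *	ssize_t n;
 *
 *	n = aio_waitcomplete(&done, &ts);
 *	if (n == -1)
 *		... errno is EAGAIN if the timeout expired ...
 *	else
 *		... `done' points at the completed aiocb; n bytes transferred ...
 */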
2361 
2362 
2363 #ifndef VFS_AIO
2364 static int
2365 filt_aioattach(struct knote *kn)
2366 {
2367 
2368 	return (ENXIO);
2369 }
2370 
2371 struct filterops aio_filtops =
2372 	{ 0, filt_aioattach, NULL, NULL };
2373 
2374 #else
2375 static int
2376 filt_aioattach(struct knote *kn)
2377 {
2378 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2379 
2380 	/*
2381 	 * The aiocbe pointer must be validated before using it, so
2382 	 * registration is restricted to the kernel; the user cannot
2383 	 * set EV_FLAG1.
2384 	 */
2385 	if ((kn->kn_flags & EV_FLAG1) == 0)
2386 		return (EPERM);
2387 	kn->kn_flags &= ~EV_FLAG1;
2388 
2389 	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2390 
2391 	return (0);
2392 }
2393 
2394 static void
2395 filt_aiodetach(struct knote *kn)
2396 {
2397 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2398 	int s = splhigh();	 /* XXX no clue, so overkill */
2399 
2400 	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2401 	splx(s);
2402 }
2403 
2404 /*ARGSUSED*/
2405 static int
2406 filt_aio(struct knote *kn, long hint)
2407 {
2408 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2409 
2410 	kn->kn_data = 0;		/* XXX data returned? */
2411 	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2412 	    aiocbe->jobstate != JOBST_JOBBFINISHED)
2413 		return (0);
2414 	kn->kn_flags |= EV_EOF;
2415 	return (1);
2416 }
2417 
2418 struct filterops aio_filtops =
2419 	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
2420 #endif /* VFS_AIO */
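
/*
 * Consuming EVFILT_AIO events from userland.  filt_aioattach() above
 * rejects direct registration (EV_FLAG1 is kernel-only), so the knote is
 * attached by the kernel when an aiocb asks for kqueue notification.  The
 * sketch below uses the SIGEV_KEVENT hookup of later FreeBSD releases as
 * an illustration; the exact userland interface at this revision may
 * differ.  `kq' (from kqueue()), `fd' and `buf' are assumed.
 *
 *	#include <aio.h>
 *	#include <string.h>
 *	#include <sys/event.h>
 *
 *	struct aiocb cb;
 *	struct kevent ev;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	(void)aio_read(&cb);
 *
 *	(void)kevent(kq, NULL, 0, &ev, 1, NULL);
 *	... ev.ident identifies the completed aiocb; call aio_return() on it ...
 */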
2421