/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $Id: vfs_aio.c,v 1.1 1997/06/16 00:27:26 dyson Exp $
 */

/*
 * This file contains support for the POSIX.4 AIO facility.
 *
 * The initial version provides only the (bogus) synchronous semantics;
 * asynchronous operation will be supported in the future.  Note that a
 * bit in a private field allows the user mode subroutine to adapt the
 * kernel operations to true POSIX.4 behavior for future compatibility.
 *
 * This code is used to support true POSIX.4 AIO/LIO with the help of a
 * user mode subroutine package.  Note that eventually more support will
 * be pushed into the kernel.
 */
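
/*
 * For orientation, a minimal (hypothetical) userland sequence that this
 * facility backs might look like:
 *
 *	struct aiocb cb;
 *	char buf[4096];
 *	int nread;
 *
 *	memset(&cb, 0, sizeof cb);
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof buf;
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == 0) {
 *		while (aio_error(&cb) == EINPROGRESS)
 *			;			-- or block in aio_suspend()
 *		nread = aio_return(&cb);
 *	}
 *
 * The calls above are the POSIX.4 user-level entry points; the syscalls
 * in this file implement the kernel half of that interface.
 */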

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/dirent.h>
#include <sys/signalvar.h>
#include <sys/queue.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/sysctl.h>
#include <sys/aio.h>

#define AIOCBLIST_CANCELLED	0x1
#define AIOCBLIST_RUNDOWN	0x4
#define AIOCBLIST_ASYNCFREE	0x8
#define AIOCBLIST_SUSPEND	0x10

#if 0
#define DEBUGAIO
#define DIAGNOSTIC
#endif

static	int jobrefid;

#define JOBST_NULL		0x0
#define JOBST_JOBQPROC		0x1
#define JOBST_JOBQGLOBAL	0x2
#define JOBST_JOBRUNNING	0x3
#define JOBST_JOBFINISHED	0x4
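
/*
 * A job normally moves through these states:
 *
 *	JOBST_NULL -> JOBST_JOBQPROC or JOBST_JOBQGLOBAL (queued by
 *	_aio_aqueue()), then JOBST_JOBRUNNING once an AIO daemon picks
 *	it up in aio_selectjob(), and finally JOBST_JOBFINISHED, where
 *	it sits on the per-process done queue until aio_return() or
 *	process rundown frees it.
 */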

#define MAX_AIO_PER_PROC	32
#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
#define MAX_AIO_PROCS		128
#define MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
#define TARGET_AIO_PROCS	64

/*
 * Job queue item
 */
struct aiocblist {
	TAILQ_ENTRY (aiocblist) list;		/* List of jobs */
	TAILQ_ENTRY (aiocblist) plist;		/* List of jobs for proc */
	int	jobflags;
	int	jobstate;
	struct	proc *userproc;			/* User process */
	struct	aioproclist	*jobaioproc;	/* AIO process descriptor */
	struct	aiocb uaiocb;			/* Kernel I/O control block */
};

#define AIOP_FREE	0x1			/* proc on free queue */
/*
 * AIO process info
 */
struct aioproclist {
	int aioprocflags;			/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct proc *aioproc;			/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
};

struct kaioinfo {
	int	kaio_maxactive_count;	/* maximum number of AIOs */
	int	kaio_active_count;	/* number of currently used AIOs */
	int	kaio_qallowed_count;	/* maximum size of AIO queue */
	int	kaio_queue_count;	/* size of AIO queue */
	TAILQ_HEAD (,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_jobdone;	/* done queue for process */
};
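
/*
 * Queueing is throttled at two levels: num_queue_count against
 * max_queue_count system-wide, and kaio_queue_count against
 * kaio_qallowed_count per process (see aio_aqueue()).  The active
 * counts bound, in the same way, how many queued jobs the daemons
 * will actually run concurrently on behalf of one process.
 */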

TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
TAILQ_HEAD(,aiocblist) aio_freejobs;

int max_aio_procs = MAX_AIO_PROCS;
int num_aio_procs = 0;
int target_aio_procs = TARGET_AIO_PROCS;

int max_queue_count = MAX_AIO_QUEUE;
int num_queue_count = 0;

void aio_init_aioinfo(struct proc *p);
void aio_onceonly(void);
void aio_proc_rundown(struct proc *p);
int aio_free_entry(struct aiocblist *aiocbe);
void aio_cancel_internal(struct aiocblist *aiocbe);
void aio_process(struct aiocblist *aiocbe);
void pmap_newvmspace(struct vmspace *);
static int aio_newproc(void);
static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void aio_marksuspend(struct proc *p, int njobs, int *joblist, int set);

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

/*
 * Startup initialization
 */
void
aio_onceonly(void) {
	TAILQ_INIT(&aio_freeproc);
	TAILQ_INIT(&aio_activeproc);
	TAILQ_INIT(&aio_jobs);
	TAILQ_INIT(&aio_freejobs);
}

/*
 * Init the per-process aioinfo structure.
 */
void
aio_init_aioinfo(struct proc *p) {
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		ki = malloc(sizeof (struct kaioinfo), M_AIO, M_WAITOK);
		p->p_aioinfo = ki;
		ki->kaio_maxactive_count = MAX_AIO_PER_PROC;
		ki->kaio_active_count = 0;
		ki->kaio_qallowed_count = MAX_AIO_QUEUE_PER_PROC;
		ki->kaio_queue_count = 0;
		TAILQ_INIT(&ki->kaio_jobdone);
		TAILQ_INIT(&ki->kaio_jobqueue);
	}
}

/*
 * Free a job entry.  Wait for completion if it is currently
 * active, but don't delay forever.  If we delay, we return
 * a flag that says that we have to restart the queue scan.
 */
int
aio_free_entry(struct aiocblist *aiocbe) {
	struct kaioinfo *ki;
	struct aioproclist *aiop;
	struct proc *p;

	if (aiocbe->jobstate == JOBST_NULL)
		panic("aio_free_entry: freeing already free job");

	p = aiocbe->userproc;
	ki = p->p_aioinfo;
	if (ki == NULL)
		panic("aio_free_entry: missing p->p_aioinfo");

	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
			return 0;
		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
		if (tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", hz*5)) {
			aiocbe->jobflags |= AIOCBLIST_ASYNCFREE;
			aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			return 1;
		}
		aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
	}
	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;

	if (ki->kaio_queue_count <= 0)
		panic("aio_free_entry: process queue size <= 0");
	if (num_queue_count <= 0)
		panic("aio_free_entry: system wide queue size <= 0");

	--ki->kaio_queue_count;
	--num_queue_count;

	if (aiocbe->jobstate == JOBST_JOBQPROC) {
		aiop = aiocbe->jobaioproc;
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBFINISHED) {
		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	}
	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	aiocbe->jobstate = JOBST_NULL;
	return 0;
}

/*
 * Rundown the jobs for a given process.
 */
void
aio_proc_rundown(struct proc *p) {
	struct kaioinfo *ki;
	struct aiocblist *aiocbe, *aiocbn;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}
	free(ki, M_AIO);
	p->p_aioinfo = NULL;	/* don't leave a dangling pointer behind */
}

/*
 * Select a job to run (called by an AIO daemon)
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop) {
	struct aiocblist *aiocbe;

	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
	if (aiocbe) {
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
		return aiocbe;
	}

	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe;
	    aiocbe = TAILQ_NEXT(aiocbe, list)) {
		struct kaioinfo *ki;
		struct proc *userp;

		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			return aiocbe;
		}
	}

	return NULL;
}

/*
 * The AIO activity proper.
 */
void
aio_process(struct aiocblist *aiocbe) {
	struct filedesc *fdp;
	struct proc *userp;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	int error;

	userp = aiocbe->userproc;
	cb = &aiocbe->uaiocb;

#ifdef DEBUGAIO
	printf("fd: %d, offset: 0x%x, address: 0x%x, size: %d\n",
		cb->aio_fildes, (int) cb->aio_offset,
			(int) cb->aio_buf, (int) cb->aio_nbytes);
	tsleep(curproc, PVM, "aioprc", hz);
#endif
	fdp = curproc->p_fd;
	/*
	 * The descriptor was validated in _aio_aqueue(), and we share
	 * the submitter's descriptor table here, so a straight lookup
	 * is safe.
	 */
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	aiov.iov_base = cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = curproc;

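	/*
	 * UIO_USERSPACE is correct even though this runs in an AIO
	 * daemon: the daemon has attached itself to the submitting
	 * process's vmspace (see aio_startproc()), so the user
	 * addresses in the control block resolve as intended.
	 */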
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
	} else {
		auio.uio_rw = UIO_WRITE;
		error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
	}

	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR ||
			    error == EWOULDBLOCK)
				error = 0;
			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
				psignal(userp, SIGPIPE);
		}
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;
}

/*
 * The AIO daemon.
 */
static void
aio_startproc(void *uproc)
{
	struct aioproclist *aiop;

	/*
	 * Allocate and ready the aio control info
	 */
	aiop = malloc(sizeof *aiop, M_AIO, M_WAITOK);
	aiop->aioproc = curproc;
	aiop->aioprocflags = AIOP_FREE;	/* malloc()ed memory is uninitialized */
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
	TAILQ_INIT(&aiop->jobtorun);

	/*
	 * Get rid of current address space
	 */
	if (curproc->p_vmspace->vm_refcnt == 1) {
		if (curproc->p_vmspace->vm_shm)
			shmexit(curproc);
		pmap_remove_pages(&curproc->p_vmspace->vm_pmap, 0, USRSTACK);
		vm_map_remove(&curproc->p_vmspace->vm_map, 0, USRSTACK);
	} else {
		vmspace_exec(curproc);
	}

	/*
	 * Make up a name for the daemon
	 */
	strcpy(curproc->p_comm, "aiodaemon");

	/*
	 * Get rid of our current filedescriptors
	 */
	fdfree(curproc);
	curproc->p_fd = NULL;
	curproc->p_ucred = crcopy(curproc->p_ucred);
	curproc->p_ucred->cr_uid = 0;
	curproc->p_ucred->cr_groups[0] = 1;
	curproc->p_flag |= P_SYSTEM;

#ifdef DEBUGAIO
	printf("Started new process: %d\n", curproc->p_pid);
#endif
	wakeup(uproc);

	while (1) {
		struct vmspace *myvm, *tmpvm;
		struct proc *cp = curproc;
		struct aiocblist *aiocbe;

		if ((aiop->aioprocflags & AIOP_FREE) == 0) {
			TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
			aiop->aioprocflags |= AIOP_FREE;
		}
		tsleep(curproc, PZERO, "aiordy", 0);
		if (aiop->aioprocflags & AIOP_FREE) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
		}

		myvm = curproc->p_vmspace;

		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			struct aiocb *cb;
			struct kaioinfo *ki;
			struct proc *userp;

			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;
			ki = userp->p_aioinfo;

			aiocbe->jobstate = JOBST_JOBRUNNING;
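			/*
			 * Borrow the user process's address space and
			 * descriptor table so that the I/O can copyin/
			 * copyout directly against the submitter; the
			 * daemon's own (empty) vmspace is restored once
			 * the job loop drains.
			 */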
			if (userp != cp) {
				tmpvm = curproc->p_vmspace;
				curproc->p_vmspace = userp->p_vmspace;
				++curproc->p_vmspace->vm_refcnt;
				pmap_activate(curproc);
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}
				if (curproc->p_fd)
					fdfree(curproc);
				curproc->p_fd = fdshare(userp);
				cp = userp;
			}

			ki->kaio_active_count++;
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);
			--ki->kaio_active_count;

			aiocbe->jobstate = JOBST_JOBFINISHED;

			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
			} else {
				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
			}

			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (aiocbe->jobflags & AIOCBLIST_SUSPEND) {
				wakeup(userp);
				aiocbe->jobflags &= ~AIOCBLIST_SUSPEND;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				psignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		if (cp != curproc) {
			tmpvm = curproc->p_vmspace;
			curproc->p_vmspace = myvm;
			pmap_activate(curproc);
			vmspace_free(tmpvm);
			if (curproc->p_fd)
				fdfree(curproc);
			curproc->p_fd = NULL;
			cp = curproc;
		}
	}
}

/*
 * Create a new AIO daemon.
 */
static int
aio_newproc(void) {
	int error;
	int rval[2];
	struct rfork_args rfa;
	struct proc *p;

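	/*
	 * RFPROC creates the child, RFMEM shares our address space
	 * with it (aio_startproc() replaces that space immediately),
	 * and RFCFDG gives it a clean file descriptor table.
	 */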
	rfa.flags = RFMEM | RFPROC | RFCFDG;

	if ((error = rfork(curproc, &rfa, &rval[0])) != 0)
		return error;

	cpu_set_fork_handler(p = pfind(rval[0]), aio_startproc, curproc);

#ifdef DEBUGAIO
	printf("Waiting for new process: %d, count: %d\n",
		rval[0], num_aio_procs);
#endif

	/*
	 * XXX the new daemon is counted even if this tsleep() times out
	 * before the child signals readiness.
	 */
	error = tsleep(curproc, PZERO, "aiosta", 5*hz);
	++num_aio_procs;

	return error;
}

/*
 * Queue a new AIO request.
 */
static int
_aio_aqueue(struct proc *p, struct aiocb *job, int type) {
	struct filedesc *fdp;
	struct file *fp;
	unsigned int fd;
	int error;
	int opcode;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;

	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	} else {
		aiocbe = malloc(sizeof *aiocbe, M_AIO, M_WAITOK);
	}

	error = copyin((caddr_t)job,
		(caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb);
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return error;
	}

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = aiocbe->uaiocb.aio_fildes;
	if (fd >= fdp->fd_nfiles) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, -1);
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	fp = fdp->fd_ofiles[fd];
	if (fp == NULL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, -1);
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	if (aiocbe->uaiocb.aio_offset == -1LL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, -1);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

#ifdef DEBUGAIO
	printf("job addr: 0x%x, 0x%x, %d\n", (int) job,
		(int) &job->_aiocb_private.kernelinfo, jobrefid);
#endif

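	/*
	 * Hand the job reference back to userland through the private
	 * kernelinfo field; aio_error() and aio_return() later fetch
	 * this token with fuword() to find the job again.
	 */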
	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, -1);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return error;
	}

	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)jobrefid;
#ifdef DEBUGAIO
	printf("aio_aqueue: New job: %d...  ", jobrefid);
#endif
	++jobrefid;

	if (type != LIO_NOP) {
		aiocbe->uaiocb.aio_lio_opcode = type;
	}

	opcode = aiocbe->uaiocb.aio_lio_opcode;
	if (opcode == LIO_NOP) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, -1);
			suword(&job->_aiocb_private.error, 0);
		}
		return 0;
	}

	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, -1);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	/*
	 * Now that the opcode is known, verify that the descriptor
	 * permits the requested operation; a blanket FWRITE test
	 * would wrongly reject reads on read-only descriptors.
	 */
	if (((opcode == LIO_READ) && ((fp->f_flag & FREAD) == 0)) ||
	    ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, -1);
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	suword(&job->_aiocb_private.error, 0);
	suword(&job->_aiocb_private.status, 0);
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	ki = p->p_aioinfo;
	++num_queue_count;
	++ki->kaio_queue_count;

retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
#ifdef DEBUGAIO
		printf("found a free AIO process\n");
#endif
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		TAILQ_INSERT_TAIL(&aiop->jobtorun, aiocbe, list);
		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
		aiocbe->jobstate = JOBST_JOBQPROC;
		aiocbe->jobaioproc = aiop;
		wakeup(aiop->aioproc);
	} else if ((num_aio_procs < max_aio_procs) &&
			(ki->kaio_active_count < ki->kaio_maxactive_count)) {
		if ((error = aio_newproc()) != 0) {
#ifdef DEBUGAIO
			printf("aio_aqueue: problem sleeping for starting proc: %d\n",
				error);
#endif
		}
		goto retryproc;
	} else {
#ifdef DEBUGAIO
		printf("queuing to global queue\n");
#endif
		TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
		aiocbe->jobstate = JOBST_JOBQGLOBAL;
	}

	return 0;
}

static int
aio_aqueue(struct proc *p, struct aiocb *job, int type) {
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(p, job, type);
}

/*
 * Support the aio_return system call
 */
int
aio_return(struct proc *p, struct aio_return_args *uap, int *retval) {
	int jobref, status;
	struct aiocblist *cb;
	struct kaioinfo *ki;

	ki = p->p_aioinfo;
	if (ki == NULL) {
		return EINVAL;
	}

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if (jobref == -1)
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			retval[0] = cb->uaiocb._aiocb_private.status;
			aio_free_entry(cb);
			return 0;
		}
	}

	status = fuword(&uap->aiocbp->_aiocb_private.status);
	if (status == -1)
		return 0;

	return (EINVAL);
}

/*
 * Mark (or unmark) the jobs in joblist so that a completing AIO
 * daemon will wake up the owning process.  With njobs == 0, every
 * queued job for the process is affected.
 */
static void
aio_marksuspend(struct proc *p, int njobs, int *joblist, int set) {
	struct aiocblist *aiocbe;
	struct kaioinfo *ki;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe;
	    aiocbe = TAILQ_NEXT(aiocbe, plist)) {
		if (njobs) {
			int i;

			for (i = 0; i < njobs; i++) {
				if (((int) aiocbe->uaiocb._aiocb_private.kernelinfo) == joblist[i])
					break;
			}

			if (i == njobs)
				continue;
		}

		if (set)
			aiocbe->jobflags |= AIOCBLIST_SUSPEND;
		else
			aiocbe->jobflags &= ~AIOCBLIST_SUSPEND;
	}
}

/*
 * Allow a process to wakeup when any of the I/O requests are
 * completed.
 */
int
aio_suspend(struct proc *p, struct aio_suspend_args *uap, int *retval) {
	struct timeval atv;
	struct timespec ts;
	struct aiocb *const *cbptr, *cbp;
	struct kaioinfo *ki;
	struct aiocblist *cb;
	int i;
	int error, s, timo;
	int *joblist;

	timo = 0;
	if (uap->timeout) {
		/*
		 * Get timespec struct
		 */
		if ((error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) != 0) {
			return error;
		}

		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		/*
		 * XXX this is not as careful as settimeofday() about minimising
		 * interrupt latency.  The hzto() interface is inconvenient as usual.
		 */
		s = splclock();
		timevaladd(&atv, &time);
		timo = hzto(&atv);
		splx(s);
		if (timo == 0)
			timo = 1;
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	/* XXX nent comes from the user unchecked; bound it before the malloc */
	joblist = malloc(uap->nent * sizeof(int), M_TEMP, M_WAITOK);
	cbptr = uap->aiocbp;

	for (i = 0; i < uap->nent; i++) {
		cbp = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
#ifdef DEBUGAIO
		printf("cbp: %x\n", (int) cbp);
#endif
		joblist[i] = fuword(&cbp->_aiocb_private.kernelinfo);
	}

#ifdef DEBUGAIO
	printf("Suspend, timeout: %d clocks, jobs:", timo);
	for (i = 0; i < uap->nent; i++)
		printf(" %d", joblist[i]);
	printf("\n");
#endif

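	/*
	 * Sleep/wakeup protocol: mark the listed jobs AIOCBLIST_SUSPEND
	 * so that a daemon does wakeup(userp) as each one finishes (see
	 * aio_startproc()), then rescan the done queue.  A tsleep()
	 * timeout maps to EAGAIN and a signal to EINTR, as POSIX
	 * specifies for aio_suspend().
	 */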
	while (1) {
		for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
		    cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < uap->nent; i++) {
				if (((int) cb->uaiocb._aiocb_private.kernelinfo) == joblist[i]) {
					free(joblist, M_TEMP);
					return 0;
				}
			}
		}

		aio_marksuspend(p, uap->nent, joblist, 1);
#ifdef DEBUGAIO
		printf("Suspending -- waiting for all I/O's to complete: ");
		for (i = 0; i < uap->nent; i++)
			printf(" %d", joblist[i]);
		printf("\n");
#endif
		error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
		aio_marksuspend(p, uap->nent, joblist, 0);

		if (error == EINTR) {
#ifdef DEBUGAIO
			printf(" signal\n");
#endif
			free(joblist, M_TEMP);
			return EINTR;
		} else if (error == EWOULDBLOCK) {
#ifdef DEBUGAIO
			printf(" timeout\n");
#endif
			free(joblist, M_TEMP);
			return EAGAIN;
		}
#ifdef DEBUGAIO
		printf("\n");
#endif
	}
	/* NOTREACHED */
	return EINVAL;
}

/*
 * aio_cancel at the kernel level is a NOOP right now.  It
 * might be possible to support it partially in user mode, or
 * in kernel mode later on.
 */
int
aio_cancel(struct proc *p, struct aio_cancel_args *uap, int *retval) {
	return AIO_NOTCANCELLED;
}

/*
 * aio_error is implemented in the kernel level for compatibility
 * purposes only.  For a user mode async implementation, it would be
 * best to do it in a userland subroutine.
 */
int
aio_error(struct proc *p, struct aio_error_args *uap, int *retval) {
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int jobref;
	int status;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if (jobref == -1)
		return EFAULT;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			retval[0] = cb->uaiocb._aiocb_private.error;
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			retval[0] = EINPROGRESS;
			return 0;
		}
	}

	/*
	 * Hack for lio
	 */
	status = fuword(&uap->aiocbp->_aiocb_private.status);
	if (status == -1) {
		return fuword(&uap->aiocbp->_aiocb_private.error);
	}
	return EINVAL;
}

int
aio_read(struct proc *p, struct aio_read_args *uap, int *retval) {
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error, pmodes;

	/*
	 * Async requests are queued; only the AIO_PMODE_SYNC mode is
	 * handled inline below.
	 */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
	}

	/*
	 * Get control block
	 */
	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	aiov.iov_base = iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
	if (error &&
	    (auio.uio_resid != cnt) &&
	    (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
		error = 0;
	cnt -= auio.uio_resid;
	*retval = cnt;
	return error;
}

int
aio_write(struct proc *p, struct aio_write_args *uap, int *retval) {
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error;
	int pmodes;

	/*
	 * Async requests are queued; only the AIO_PMODE_SYNC mode is
	 * handled inline below.
	 */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
	}

	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	aiov.iov_base = iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR ||
			    error == EWOULDBLOCK)
				error = 0;
			if (error == EPIPE)
				psignal(p, SIGPIPE);
		}
	}
	cnt -= auio.uio_resid;
	*retval = cnt;
	return error;
}

int
lio_listio(struct proc *p, struct lio_listio_args *uap, int *retval) {
	int nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int error, runningcode;
	int i;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
		return EINVAL;

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX)
		return EINVAL;

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if ((nent + num_queue_count) > max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
		return EAGAIN;

	/*
	 * Reserve resources; remember that we have to unwind part of
	 * them sometimes.  XXX _aio_aqueue() bumps the counts again for
	 * each request, so every queued entry is double-charged against
	 * the limits.
	 */
	num_queue_count += nent;
	ki->kaio_queue_count += nent;
	nentqueued = 0;

	/*
	 * Queue each of the list's I/O requests.
	 */
	cbptr = uap->acb_list;
	for (i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
		error = aio_aqueue(p, iocb, 0);
		if (error == 0)
			nentqueued++;
	}

	/* XXX the reservations above are not unwound on this failure */
	if (nentqueued == 0)
		return EIO;

	runningcode = 0;
	if (nentqueued != nent)
		runningcode = EIO;

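	/*
	 * For LIO_WAIT, poll the done queue for every request that was
	 * queued, sleeping via the same suspend marking that
	 * aio_suspend() uses until all of them have completed.
	 */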
	if (uap->mode == LIO_WAIT) {
		while (1) {
			for (i = 0; i < uap->nent; i++) {
				int found;
				int jobref, command, status;

				iocb = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP)
					continue;

				status = fuword(&iocb->_aiocb_private.status);
				if (status == -1)
					continue;
				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				found = 0;
				for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
				    cb = TAILQ_NEXT(cb, plist)) {
					if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
						found++;
						break;
					}
				}
				if (found == 0)
					break;
			}

			if (i == uap->nent) {
				return runningcode;
			}

			aio_marksuspend(p, 0, NULL, 1);
			error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);
			aio_marksuspend(p, 0, NULL, 0);

			if (error == EINTR) {
				return EINTR;
			} else if (error == EWOULDBLOCK) {
				return EAGAIN;
			}
		}
	}

	return runningcode;
}