xref: /freebsd/sys/kern/vfs_aio.c (revision 0de89efe5c443f213c7ea28773ef2dc6cf3af2ed)
1 /*
2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. John S. Dyson's name may not be used to endorse or promote products
10  *    derived from this software without specific prior written permission.
11  *
12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13  * bad that happens because of using this software isn't the responsibility
14  * of the author.  This software is distributed AS-IS.
15  *
16  * $Id: vfs_aio.c,v 1.3 1997/07/17 04:49:31 dyson Exp $
17  */
18 
19 /*
20  * This file contains support for the POSIX.4 AIO facility.
21  *
22  * The initial version provides only the (bogus) synchronous semantics
23  * but will support async in the future.  Note that a bit
24  * in a private field allows the user mode subroutine to adapt
25  * the kernel operations to true POSIX.4 for future compatibility.
26  *
27  * This code is used to support true POSIX.4 AIO/LIO with the help
28  * of a user mode subroutine package.  Note that eventually more support
29  * will be pushed into the kernel.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/sysproto.h>
35 #include <sys/filedesc.h>
36 #include <sys/kernel.h>
37 #include <sys/fcntl.h>
38 #include <sys/file.h>
39 #include <sys/unistd.h>
40 #include <sys/vnode.h>
41 #include <sys/proc.h>
42 #include <sys/uio.h>
43 #include <sys/malloc.h>
44 #include <sys/signalvar.h>
45 
46 #include <vm/vm.h>
47 #include <vm/vm_param.h>
48 #include <vm/vm_extern.h>
49 #include <vm/pmap.h>
50 #include <vm/vm_map.h>
51 #include <sys/aio.h>
52 #include <sys/shm.h>
53 
54 #include <machine/cpu.h>
55 
56 #define AIOCBLIST_CANCELLED	0x1
57 #define AIOCBLIST_RUNDOWN	0x4
58 #define AIOCBLIST_ASYNCFREE	0x8
59 #define AIOCBLIST_SUSPEND	0x10
60 
61 #if 0
62 #define DEBUGAIO
63 #define DIAGNOSTIC
64 #endif
65 
66 static	int jobrefid;
67 
68 #define JOBST_NULL		0x0
69 #define	JOBST_JOBQPROC		0x1
70 #define JOBST_JOBQGLOBAL	0x2
71 #define JOBST_JOBRUNNING	0x3
72 #define JOBST_JOBFINISHED	0x4
73 
74 #define MAX_AIO_PER_PROC	32
75 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
76 #define MAX_AIO_PROCS		128
77 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
78 #define TARGET_AIO_PROCS	64
79 
/*
 * Job queue item -- one control block per outstanding AIO request.
 * "list" links the entry on the free list, the global job queue, or a
 * daemon's jobtorun queue; "plist" links it on the owning process's
 * queued/done lists.  jobstate (JOBST_*) tracks which queues apply.
 */
struct aiocblist {
	TAILQ_ENTRY (aiocblist) list;		/* List of jobs */
	TAILQ_ENTRY (aiocblist) plist;		/* List of jobs for proc */
	int	jobflags;			/* AIOCBLIST_* flags */
	int	jobstate;			/* JOBST_* lifecycle state */
	struct	proc *userproc;			/* User process */
	struct	aioproclist	*jobaioproc;	/* AIO process descriptor */
	struct	aiocb uaiocb;			/* Kernel I/O control block */
};
92 
#define AIOP_FREE	0x1			/* proc on free queue */
/*
 * AIO process info -- one per AIO daemon process.  A daemon sits on
 * aio_freeproc (flagged AIOP_FREE) when idle and on aio_activeproc
 * while servicing jobs.
 */
struct aioproclist {
	int aioprocflags;			/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct proc *aioproc;			/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
};
103 
/*
 * Per-process AIO bookkeeping, hung off p->p_aioinfo and allocated
 * lazily by aio_init_aioinfo().
 */
struct kaioinfo {
	int	kaio_maxactive_count;	/* maximum number of AIOs */
	int	kaio_active_count;	/* number of currently used AIOs */
	int	kaio_qallowed_count;	/* maximum size of AIO queue */
	int	kaio_queue_count;	/* size of AIO queue */
	TAILQ_HEAD (,aiocblist)	kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist)	kaio_jobdone;	/* done queue for process */
};
112 
/* Daemon bookkeeping: idle and busy AIO daemon processes. */
TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
TAILQ_HEAD(,aiocblist) aio_freejobs;		/* recycled job entries */

int max_aio_procs = MAX_AIO_PROCS;	/* cap on daemon processes */
int num_aio_procs = 0;			/* daemons created so far */
int target_aio_procs = TARGET_AIO_PROCS; /* desired pool size (not referenced in this file) */

int max_queue_count = MAX_AIO_QUEUE;	/* system-wide queued-job limit */
int num_queue_count = 0;		/* system-wide queued-job count */
124 void aio_init_aioinfo(struct proc *p) ;
125 void aio_onceonly(void *) ;
126 int aio_free_entry(struct aiocblist *aiocbe);
127 void aio_cancel_internal(struct aiocblist *aiocbe);
128 void aio_process(struct aiocblist *aiocbe);
129 void pmap_newvmspace(struct vmspace *);
130 static int aio_newproc(void) ;
131 static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ;
132 static void aio_marksuspend(struct proc *p, int njobs, int *joblist, int set) ;
133 
134 SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
135 
136 /*
137  * Startup initialization
138  */
139 void
140 aio_onceonly(void *na) {
141 	TAILQ_INIT(&aio_freeproc);
142 	TAILQ_INIT(&aio_activeproc);
143 	TAILQ_INIT(&aio_jobs);
144 	TAILQ_INIT(&aio_freejobs);
145 }
146 
147 /*
148  * Init the per-process aioinfo structure.
149  */
150 void
151 aio_init_aioinfo(struct proc *p) {
152 	struct kaioinfo *ki;
153 	if (p->p_aioinfo == NULL) {
154 		ki = malloc(sizeof (struct kaioinfo), M_AIO, M_WAITOK);
155 		p->p_aioinfo = ki;
156 		ki->kaio_maxactive_count = MAX_AIO_PER_PROC;
157 		ki->kaio_active_count = 0;
158 		ki->kaio_qallowed_count = MAX_AIO_QUEUE_PER_PROC;
159 		ki->kaio_queue_count = 0;
160 		TAILQ_INIT(&ki->kaio_jobdone);
161 		TAILQ_INIT(&ki->kaio_jobqueue);
162 	}
163 }
164 
165 /*
166  * Free a job entry.  Wait for completion if it is currently
167  * active, but don't delay forever.  If we delay, we return
168  * a flag that says that we have to restart the queue scan.
169  */
170 int
171 aio_free_entry(struct aiocblist *aiocbe) {
172 	struct kaioinfo *ki;
173 	struct aioproclist *aiop;
174 	struct proc *p;
175 
176 	if (aiocbe->jobstate == JOBST_NULL)
177 		panic("aio_free_entry: freeing already free job");
178 
179 	p = aiocbe->userproc;
180 	ki = p->p_aioinfo;
181 	if (ki == NULL)
182 		panic("aio_free_entry: missing p->p_aioinfo");
183 
184 	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
185 		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
186 			return 0;
187 		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
188 		if (tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", hz*5)) {
189 			aiocbe->jobflags |= AIOCBLIST_ASYNCFREE;
190 			aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
191 			return 1;
192 		}
193 		aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
194 	}
195 	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
196 
197 	if (ki->kaio_queue_count <= 0)
198 		panic("aio_free_entry: process queue size <= 0");
199 	if (num_queue_count <= 0)
200 		panic("aio_free_entry: system wide queue size <= 0");
201 
202 	--ki->kaio_queue_count;
203 	--num_queue_count;
204 
205 	if ( aiocbe->jobstate == JOBST_JOBQPROC) {
206 		aiop = aiocbe->jobaioproc;
207 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
208 	} else if ( aiocbe->jobstate == JOBST_JOBQGLOBAL) {
209 		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
210 	} else if ( aiocbe->jobstate == JOBST_JOBFINISHED) {
211 		ki = p->p_aioinfo;
212 		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
213 	}
214 	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
215 	aiocbe->jobstate = JOBST_NULL;
216 	return 0;
217 }
218 
/*
 * Rundown the jobs for a given process -- called when the process is
 * going away to free every completed and queued AIO request, then the
 * kaioinfo itself.  aio_free_entry() may sleep; when it returns
 * nonzero the lists may have changed under us, so restart the scan.
 */
void
aio_proc_rundown(struct proc *p) {
	struct kaioinfo *ki;
	struct aiocblist *aiocbe, *aiocbn;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

restart1:
	/* Drain completed jobs first. */
	for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobdone);
		aiocbe;
		aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	/* Then jobs still queued or running. */
	for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
		aiocbe;
		aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}
	/* NOTE(review): p->p_aioinfo is not cleared after the free --
	 * confirm nothing can reach the stale pointer afterwards. */
	free(ki, M_AIO);
}
250 
251 /*
252  * Select a job to run (called by an AIO daemon)
253  */
254 static struct aiocblist *
255 aio_selectjob(struct aioproclist *aiop) {
256 
257 	struct aiocblist *aiocbe;
258 
259 	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
260 	if (aiocbe) {
261 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
262 		return aiocbe;
263 	}
264 
265 	for (aiocbe = TAILQ_FIRST(&aio_jobs);
266 		aiocbe;
267 		aiocbe = TAILQ_NEXT(aiocbe, list)) {
268 		struct kaioinfo *ki;
269 		struct proc *userp;
270 
271 		userp = aiocbe->userproc;
272 		ki = userp->p_aioinfo;
273 
274 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
275 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
276 			return aiocbe;
277 		}
278 	}
279 
280 	return NULL;
281 }
282 
283 /*
284  * The AIO activity proper.
285  */
286 void
287 aio_process(struct aiocblist *aiocbe) {
288 	struct filedesc *fdp;
289 	struct proc *userp;
290 	struct aiocb *cb;
291 	struct file *fp;
292 	struct uio auio;
293 	struct iovec aiov;
294 	unsigned int fd;
295 	int cnt;
296 	int error;
297 
298 	userp = aiocbe->userproc;
299 	cb = &aiocbe->uaiocb;
300 
301 #ifdef DEBUGAIO
302 	printf("fd: %d, offset: 0x%x, address: 0x%x, size: %d\n",
303 		cb->aio_fildes, (int) cb->aio_offset,
304 			cb->aio_buf, cb->aio_nbytes);
305 	tsleep(curproc, PVM, "aioprc", hz);
306 #endif
307 	fdp = curproc->p_fd;
308 	/*
309 	 * Range check file descriptor
310 	 */
311 	fd = cb->aio_fildes;
312 	fp = fdp->fd_ofiles[fd];
313 
314 	aiov.iov_base = cb->aio_buf;
315 	aiov.iov_len = cb->aio_nbytes;
316 
317 	auio.uio_iov = &aiov;
318 	auio.uio_iovcnt = 1;
319 	auio.uio_offset = cb->aio_offset;
320 	auio.uio_resid = cb->aio_nbytes;
321 	cnt = cb->aio_nbytes;
322 	auio.uio_segflg = UIO_USERSPACE;
323 	auio.uio_procp = curproc;
324 
325 	if (cb->aio_lio_opcode == LIO_READ) {
326 		auio.uio_rw = UIO_READ;
327 		error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
328 	} else {
329 		auio.uio_rw = UIO_WRITE;
330 		error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
331 	}
332 
333 	if (error) {
334 		if (auio.uio_resid != cnt) {
335 			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
336 				error = 0;
337 			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
338 				psignal(userp, SIGPIPE);
339 		}
340 	}
341 
342 	cnt -= auio.uio_resid;
343 	cb->_aiocb_private.error = error;
344 	cb->_aiocb_private.status = cnt;
345 
346 	return;
347 
348 }
349 
/*
 * The AIO daemon main loop.  Runs as a kernel-created process
 * (spawned via rfork in aio_newproc); services jobs from its own
 * jobtorun queue and the global queue, temporarily adopting the
 * requesting process's address space and descriptor table so the
 * transfer sees the user's buffers and files.
 */
static void
aio_startproc(void *uproc)
{
	struct aioproclist *aiop;

	/*
	 * Allocate and ready the aio control info
	 * NOTE(review): aioprocflags is |='d while still uninitialized
	 * from malloc() -- confirm the garbage bits cannot matter.
	 */
	aiop = malloc(sizeof *aiop, M_AIO, M_WAITOK);
	aiop->aioproc = curproc;
	aiop->aioprocflags |= AIOP_FREE;
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
	TAILQ_INIT(&aiop->jobtorun);

	/*
	 * Get rid of current address space: empty it out when we are the
	 * sole user, otherwise swap in a fresh one.
	 */
	if (curproc->p_vmspace->vm_refcnt == 1) {
		if (curproc->p_vmspace->vm_shm)
			shmexit(curproc);
		pmap_remove_pages(&curproc->p_vmspace->vm_pmap, 0, USRSTACK);
		vm_map_remove(&curproc->p_vmspace->vm_map, 0, USRSTACK);
	} else {
		vmspace_exec(curproc);
	}

	/*
	 * Make up a name for the daemon
	 */
	strcpy(curproc->p_comm, "aiodaemon");

	/*
	 * Get rid of our current filedescriptors; run with root
	 * credentials and mark ourselves as a system process.
	 */
	fdfree(curproc);
	curproc->p_fd = NULL;
	curproc->p_ucred = crcopy(curproc->p_ucred);
	curproc->p_ucred->cr_uid = 0;
	curproc->p_ucred->cr_groups[0] = 1;
	curproc->p_flag |= P_SYSTEM;

#ifdef DEBUGAIO
	printf("Started new process: %d\n", curproc->p_pid);
#endif
	/* Tell aio_newproc() (sleeping on uproc) that we are up. */
	wakeup(uproc);

	while(1) {
		struct vmspace *myvm, *tmpvm;
		struct proc *cp = curproc;
		struct proc *up = NULL;
		struct	aiocblist *aiocbe;

		/* Park on the free list until a job is handed to us. */
		if ((aiop->aioprocflags & AIOP_FREE) == 0) {
			TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
			aiop->aioprocflags |= AIOP_FREE;
		}
		tsleep(curproc, PZERO, "aiordy", 0);
		if (aiop->aioprocflags & AIOP_FREE) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
		}

		myvm = curproc->p_vmspace;

		while ( aiocbe = aio_selectjob(aiop)) {
			struct aiocb *cb;
			struct kaioinfo *ki;
			struct proc *userp;

			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;
			ki = userp->p_aioinfo;

			aiocbe->jobstate = JOBST_JOBRUNNING;
			/*
			 * Borrow the user's address space and descriptor
			 * table so the I/O uses the user's buffers/files.
			 */
			if (userp != cp) {
				tmpvm = curproc->p_vmspace;
				curproc->p_vmspace = userp->p_vmspace;
				++curproc->p_vmspace->vm_refcnt;
				pmap_activate(curproc);
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}
				if (curproc->p_fd)
					fdfree(curproc);
				curproc->p_fd = fdshare(userp);
				cp = userp;
			}

			ki->kaio_active_count++;
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);
			--ki->kaio_active_count;

			aiocbe->jobstate = JOBST_JOBFINISHED;

			/*
			 * An aio_free_entry() that gave up waiting asked
			 * us to recycle the entry; otherwise move it to
			 * the owner's done queue.
			 */
			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
			} else {
				TAILQ_REMOVE(&ki->kaio_jobqueue,
					aiocbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone,
					aiocbe, plist);
			}

			/* Wake sleepers in aio_free_entry(). */
			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			/* Wake the owner if it is in aio_suspend()/lio_listio(). */
			if (aiocbe->jobflags & AIOCBLIST_SUSPEND) {
				wakeup(userp);
				aiocbe->jobflags &= ~AIOCBLIST_SUSPEND;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				psignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/* Restore our own (empty) address space before idling. */
		if (cp != curproc) {
			tmpvm = curproc->p_vmspace;
			curproc->p_vmspace = myvm;
			pmap_activate(curproc);
			vmspace_free(tmpvm);
			if (curproc->p_fd)
				fdfree(curproc);
			curproc->p_fd = NULL;
			cp = curproc;
		}
	}
}
486 
/*
 * Create a new AIO daemon: rfork() a child sharing memory, aim it at
 * aio_startproc(), and wait (bounded) for it to announce itself.
 */
static int
aio_newproc() {
	int error;
	int rval[2];
	struct rfork_args rfa;
	struct proc *p;

	rfa.flags = RFMEM | RFPROC | RFCFDG;

	if (error = rfork(curproc, &rfa, &rval[0]))
		return error;

	/* rval[0] is the child's pid.
	 * NOTE(review): pfind() result is not checked for NULL here. */
	cpu_set_fork_handler(p = pfind(rval[0]), aio_startproc, curproc);

#ifdef DEBUGAIO
	printf("Waiting for new process: %d, count: %d\n",
		curproc->p_pid, num_aio_procs);
#endif

	/* aio_startproc() wakes us (its uproc argument) once ready;
	 * give up after 5 seconds. */
	error = tsleep(curproc, PZERO, "aiosta", 5*hz);
	/* NOTE(review): the daemon count is bumped even when the sleep
	 * fails or times out -- confirm this accounting is intended. */
	++num_aio_procs;

	return error;

}
515 
516 /*
517  * Queue a new AIO request.
518  */
519 static int
520 _aio_aqueue(struct proc *p, struct aiocb *job, int type) {
521 	struct filedesc *fdp;
522 	struct file *fp;
523 	unsigned int fd;
524 
525 	int error;
526 	int opcode;
527 	struct aiocblist *aiocbe;
528 	struct aioproclist *aiop;
529 	struct kaioinfo *ki;
530 
531 	if (aiocbe = TAILQ_FIRST(&aio_freejobs)) {
532 		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
533 	} else {
534 		aiocbe = malloc (sizeof *aiocbe, M_AIO, M_WAITOK);
535 	}
536 
537 	error = copyin((caddr_t)job,
538 		(caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb);
539 	if (error) {
540 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
541 		return error;
542 	}
543 
544 
545 	/*
546 	 * Get the fd info for process
547 	 */
548 	fdp = p->p_fd;
549 
550 	/*
551 	 * Range check file descriptor
552 	 */
553 	fd = aiocbe->uaiocb.aio_fildes;
554 	if (fd >= fdp->fd_nfiles) {
555 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
556 		if (type == 0) {
557 			suword(&job->_aiocb_private.status, -1);
558 			suword(&job->_aiocb_private.error, EBADF);
559 		}
560 		return EBADF;
561 	}
562 
563 	fp = fdp->fd_ofiles[fd];
564 	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0)) {
565 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
566 		if (type == 0) {
567 			suword(&job->_aiocb_private.status, -1);
568 			suword(&job->_aiocb_private.error, EBADF);
569 		}
570 		return EBADF;
571 	}
572 
573 	if (aiocbe->uaiocb.aio_offset == -1LL) {
574 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
575 		if (type == 0) {
576 			suword(&job->_aiocb_private.status, -1);
577 			suword(&job->_aiocb_private.error, EINVAL);
578 		}
579 		return EINVAL;
580 	}
581 
582 #ifdef DEBUGAIO
583 	printf("job addr: 0x%x, 0x%x, %d\n", job, &job->_aiocb_private.kernelinfo, jobrefid);
584 #endif
585 
586 	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
587 	if (error) {
588 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
589 		if (type == 0) {
590 			suword(&job->_aiocb_private.status, -1);
591 			suword(&job->_aiocb_private.error, EINVAL);
592 		}
593 		return error;
594 	}
595 
596 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)jobrefid;
597 #ifdef DEBUGAIO
598 	printf("aio_aqueue: New job: %d...  ", jobrefid);
599 #endif
600 	++jobrefid;
601 
602 	if (type != LIO_NOP) {
603 		aiocbe->uaiocb.aio_lio_opcode = type;
604 	}
605 
606 	opcode = aiocbe->uaiocb.aio_lio_opcode;
607 	if (opcode == LIO_NOP) {
608 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
609 		if (type == 0) {
610 			suword(&job->_aiocb_private.status, -1);
611 			suword(&job->_aiocb_private.error, 0);
612 		}
613 		return 0;
614 	}
615 
616 	if ((opcode != LIO_NOP) &&
617 		(opcode != LIO_READ) && (opcode != LIO_WRITE)) {
618 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
619 		if (type == 0) {
620 			suword(&job->_aiocb_private.status, -1);
621 			suword(&job->_aiocb_private.error, EINVAL);
622 		}
623 		return EINVAL;
624 	}
625 
626 	suword(&job->_aiocb_private.error, 0);
627 	suword(&job->_aiocb_private.status, 0);
628 	aiocbe->userproc = p;
629 	aiocbe->jobflags = 0;
630 	ki = p->p_aioinfo;
631 	++num_queue_count;
632 	++ki->kaio_queue_count;
633 
634 retryproc:
635 	if (aiop = TAILQ_FIRST(&aio_freeproc)) {
636 #ifdef DEBUGAIO
637 		printf("found a free AIO process\n");
638 #endif
639 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
640 		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
641 		aiop->aioprocflags &= ~AIOP_FREE;
642 		TAILQ_INSERT_TAIL(&aiop->jobtorun, aiocbe, list);
643 		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
644 		aiocbe->jobstate = JOBST_JOBQPROC;
645 		aiocbe->jobaioproc = aiop;
646 		wakeup(aiop->aioproc);
647 	} else if ((num_aio_procs < max_aio_procs) &&
648 			(ki->kaio_active_count < ki->kaio_maxactive_count)) {
649 		if (error = aio_newproc()) {
650 #ifdef DEBUGAIO
651 			printf("aio_aqueue: problem sleeping for starting proc: %d\n",
652 				error);
653 #endif
654 		}
655 		goto retryproc;
656 	} else {
657 #ifdef DEBUGAIO
658 		printf("queuing to global queue\n");
659 #endif
660 		TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
661 		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
662 		aiocbe->jobstate = JOBST_JOBQGLOBAL;
663 	}
664 
665 	return 0;
666 }
667 
668 static int
669 aio_aqueue(struct proc *p, struct aiocb *job, int type) {
670 	struct kaioinfo *ki;
671 
672 	if (p->p_aioinfo == NULL) {
673 		aio_init_aioinfo(p);
674 	}
675 
676 	if (num_queue_count >= max_queue_count)
677 		return EAGAIN;
678 
679 	ki = p->p_aioinfo;
680 	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
681 		return EAGAIN;
682 
683 	return _aio_aqueue(p, job, type);
684 }
685 
686 /*
687  * Support the aio_return system call
688  */
689 int
690 aio_return(struct proc *p, struct aio_return_args *uap, int *retval) {
691 	int jobref, status;
692 	struct aiocblist *cb;
693 	struct kaioinfo *ki;
694 	struct proc *userp;
695 
696 	ki = p->p_aioinfo;
697 	if (ki == NULL) {
698 		return EINVAL;
699 	}
700 
701 	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
702 	if (jobref == -1)
703 		return EINVAL;
704 
705 
706 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
707 		cb;
708 		cb = TAILQ_NEXT(cb, plist)) {
709 		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
710 			retval[0] = cb->uaiocb._aiocb_private.status;
711 			aio_free_entry(cb);
712 			return 0;
713 		}
714 	}
715 
716 	status = fuword(&uap->aiocbp->_aiocb_private.status);
717 	if (status == -1)
718 		return 0;
719 
720 	return (EINVAL);
721 }
722 
723 /*
724  * Rundown the jobs for a given process.
725  */
726 void
727 aio_marksuspend(struct proc *p, int njobs, int *joblist, int set) {
728 	struct aiocblist *aiocbe;
729 	struct kaioinfo *ki;
730 
731 	ki = p->p_aioinfo;
732 	if (ki == NULL)
733 		return;
734 
735 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
736 		aiocbe;
737 		aiocbe = TAILQ_NEXT(aiocbe, plist)) {
738 
739 		if (njobs) {
740 
741 			int i;
742 
743 			for(i = 0; i < njobs; i++) {
744 				if (((int) aiocbe->uaiocb._aiocb_private.kernelinfo) == joblist[i])
745 					break;
746 			}
747 
748 			if (i == njobs)
749 				continue;
750 		}
751 
752 		if (set)
753 			aiocbe->jobflags |= AIOCBLIST_SUSPEND;
754 		else
755 			aiocbe->jobflags &= ~AIOCBLIST_SUSPEND;
756 	}
757 }
758 
/*
 * Allow a process to wakeup when any of the I/O requests are
 * completed.  Snapshots the kernel job references of the caller's
 * candidate control blocks, then sleeps (optionally bounded by
 * uap->timeout) until one of those jobs reaches the done queue.
 */
int
aio_suspend(struct proc *p, struct aio_suspend_args *uap, int *retval) {
	struct timeval atv, utv;
	struct timespec ts;
	struct aiocb *const *cbptr, *cbp;
	struct kaioinfo *ki;
	struct aiocblist *cb;
	int i;
	int error, s, timo;
	int *joblist;


	timo = 0;
	if (uap->timeout) {
		/*
		 * Get timespec struct
		 */
		if (error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) {
			return error;
		}

		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts)
		if (itimerfix(&atv))
			return (EINVAL);
		/*
		 * XXX this is not as careful as settimeofday() about minimising
		 * interrupt latency.  The hzto() interface is inconvenient as usual.
		 */
		s = splclock();
		timevaladd(&atv, &time);
		timo = hzto(&atv);
		splx(s);
		if (timo == 0)
			timo = 1;	/* never sleep forever once a timeout was requested */
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	/*
	 * NOTE(review): uap->nent is not bounds-checked before this
	 * allocation -- verify a huge or negative count cannot reach here.
	 */
	joblist = malloc(uap->nent * sizeof(int), M_TEMP, M_WAITOK);
	cbptr = uap->aiocbp;

	/* Record each candidate's kernel job reference. */
	for(i=0;i<uap->nent;i++) {
		cbp = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
#ifdef DEBUGAIO
		printf("cbp: %x\n", cbp);
#endif
		joblist[i] = fuword(&cbp->_aiocb_private.kernelinfo);
		cbptr++;
	}

#ifdef DEBUGAIO
	printf("Suspend, timeout: %d clocks, jobs:", timo);
	for(i=0;i<uap->nent;i++)
		printf(" %d", joblist[i]);
	printf("\n");
#endif

	while (1) {
		/* Any candidate already done?  Return without sleeping. */
		for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
			cb;
			cb = TAILQ_NEXT(cb, plist)) {
			for(i=0;i<uap->nent;i++) {
				if (((int) cb->uaiocb._aiocb_private.kernelinfo) == joblist[i]) {
					free(joblist, M_TEMP);
					return 0;
				}
			}
		}

		/* Ask the daemons to wake us when one of these completes. */
		aio_marksuspend(p, uap->nent, joblist, 1);
#ifdef DEBUGAIO
		printf("Suspending -- waiting for all I/O's to complete: ");
		for(i=0;i<uap->nent;i++)
			printf(" %d", joblist[i]);
		printf("\n");
#endif
		error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
		aio_marksuspend(p, uap->nent, joblist, 0);

		if (error == EINTR) {
#ifdef DEBUGAIO
			printf(" signal\n");
#endif
			free(joblist, M_TEMP);
			return EINTR;
		} else if (error == EWOULDBLOCK) {
#ifdef DEBUGAIO
			printf(" timeout\n");
#endif
			free(joblist, M_TEMP);
			return EAGAIN;
		}
#ifdef DEBUGAIO
		printf("\n");
#endif
	}

/* NOTREACHED */
	return EINVAL;
}
868 
/*
 * aio_cancel at the kernel level is a NOOP right now.  It
 * might be possible to support it partially in user mode, or
 * in kernel mode later on.
 *
 * Always reports AIO_NOTCANCELLED; no queued job is affected.
 */
int
aio_cancel(struct proc *p, struct aio_cancel_args *uap, int *retval) {
	return AIO_NOTCANCELLED;
}
878 
879 /*
880  * aio_error is implemented in the kernel level for compatibility
881  * purposes only.  For a user mode async implementation, it would be
882  * best to do it in a userland subroutine.
883  */
884 int
885 aio_error(struct proc *p, struct aio_error_args *uap, int *retval) {
886 	int activeflag, errorcode;
887 	struct aiocblist *cb;
888 	struct kaioinfo *ki;
889 	int jobref;
890 	int error, status;
891 
892 	ki = p->p_aioinfo;
893 	if (ki == NULL)
894 		return EINVAL;
895 
896 	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
897 	if (jobref == -1)
898 		return EFAULT;
899 
900 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
901 		cb;
902 		cb = TAILQ_NEXT(cb, plist)) {
903 
904 		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
905 			retval[0] = cb->uaiocb._aiocb_private.error;
906 			return 0;
907 		}
908 	}
909 
910 	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue);
911 		cb;
912 		cb = TAILQ_NEXT(cb, plist)) {
913 
914 		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
915 			retval[0] = EINPROGRESS;
916 			return 0;
917 		}
918 	}
919 
920 	/*
921 	 * Hack for lio
922 	 */
923 	status = fuword(&uap->aiocbp->_aiocb_private.status);
924 	if (status == -1) {
925 		return fuword(&uap->aiocbp->_aiocb_private.error);
926 	}
927 	return EINVAL;
928 }
929 
930 int
931 aio_read(struct proc *p, struct aio_read_args *uap, int *retval) {
932 	struct filedesc *fdp;
933 	struct file *fp;
934 	struct uio auio;
935 	struct iovec aiov;
936 	unsigned int fd;
937 	int cnt;
938 	struct aiocb iocb;
939 	int error, pmodes;
940 
941 	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
942 	if ((pmodes & AIO_PMODE_SYNC) == 0) {
943 		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
944 	}
945 
946 	/*
947 	 * Get control block
948 	 */
949 	if (error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb))
950 		return error;
951 
952 	/*
953 	 * Get the fd info for process
954 	 */
955 	fdp = p->p_fd;
956 
957 	/*
958 	 * Range check file descriptor
959 	 */
960 	fd = iocb.aio_fildes;
961 	if (fd >= fdp->fd_nfiles)
962 		return EBADF;
963 	fp = fdp->fd_ofiles[fd];
964 	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
965 		return EBADF;
966 	if (iocb.aio_offset == -1LL)
967 		return EINVAL;
968 
969 	auio.uio_resid = iocb.aio_nbytes;
970 	if (auio.uio_resid < 0)
971 		return (EINVAL);
972 
973 	/*
974 	 * Process sync simply -- queue async request.
975 	 */
976 	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
977 		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
978 	}
979 
980 	aiov.iov_base = iocb.aio_buf;
981 	aiov.iov_len = iocb.aio_nbytes;
982 
983 	auio.uio_iov = &aiov;
984 	auio.uio_iovcnt = 1;
985 	auio.uio_offset = iocb.aio_offset;
986 	auio.uio_rw = UIO_READ;
987 	auio.uio_segflg = UIO_USERSPACE;
988 	auio.uio_procp = p;
989 
990 	cnt = iocb.aio_nbytes;
991 	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
992 	if (error &&
993 		(auio.uio_resid != cnt) &&
994 		(error == ERESTART || error == EINTR || error == EWOULDBLOCK))
995 			error = 0;
996 	cnt -= auio.uio_resid;
997 	*retval = cnt;
998 	return error;
999 }
1000 
1001 int
1002 aio_write(struct proc *p, struct aio_write_args *uap, int *retval) {
1003 	struct filedesc *fdp;
1004 	struct file *fp;
1005 	struct uio auio;
1006 	struct iovec aiov;
1007 	unsigned int fd;
1008 	int cnt;
1009 	struct aiocb iocb;
1010 	int error;
1011 	int pmodes;
1012 
1013 	/*
1014 	 * Process sync simply -- queue async request.
1015 	 */
1016 	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1017 	if ((pmodes & AIO_PMODE_SYNC) == 0) {
1018 		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
1019 	}
1020 
1021 	if (error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb))
1022 		return error;
1023 
1024 	/*
1025 	 * Get the fd info for process
1026 	 */
1027 	fdp = p->p_fd;
1028 
1029 	/*
1030 	 * Range check file descriptor
1031 	 */
1032 	fd = iocb.aio_fildes;
1033 	if (fd >= fdp->fd_nfiles)
1034 		return EBADF;
1035 	fp = fdp->fd_ofiles[fd];
1036 	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
1037 		return EBADF;
1038 	if (iocb.aio_offset == -1LL)
1039 		return EINVAL;
1040 
1041 	aiov.iov_base = iocb.aio_buf;
1042 	aiov.iov_len = iocb.aio_nbytes;
1043 	auio.uio_iov = &aiov;
1044 	auio.uio_iovcnt = 1;
1045 	auio.uio_offset = iocb.aio_offset;
1046 
1047 	auio.uio_resid = iocb.aio_nbytes;
1048 	if (auio.uio_resid < 0)
1049 		return (EINVAL);
1050 
1051 	auio.uio_rw = UIO_WRITE;
1052 	auio.uio_segflg = UIO_USERSPACE;
1053 	auio.uio_procp = p;
1054 
1055 	cnt = iocb.aio_nbytes;
1056 	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
1057 	if (error) {
1058 		if (auio.uio_resid != cnt) {
1059 			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
1060 				error = 0;
1061 			if (error == EPIPE)
1062 				psignal(p, SIGPIPE);
1063 		}
1064 	}
1065 	cnt -= auio.uio_resid;
1066 	*retval = cnt;
1067 	return error;
1068 }
1069 
/*
 * lio_listio system call: submit up to AIO_LISTIO_MAX requests at
 * once.  LIO_WAIT mode sleeps until every successfully queued request
 * completes; LIO_NOWAIT returns right after submission.
 */
int
lio_listio(struct proc *p, struct lio_listio_args *uap, int *retval) {
	int cnt, nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int error, runningcode;
	int i;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
		return EINVAL;

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX)
		return EINVAL;

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if ((nent + num_queue_count) > max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
		return EAGAIN;

/*
 * reserve resources, remember that we have to unwind part of them sometimes
 * NOTE(review): this reservation is never unwound on any path below, and
 * _aio_aqueue() increments both counters again per queued job -- the
 * counts appear double-charged here; verify against later revisions.
 */
	num_queue_count += nent;
	ki->kaio_queue_count += nent;
	nentqueued = 0;

/*
 * get pointers to the list of I/O requests
	iocbvec = malloc(uap->nent * sizeof(struct aiocb *), M_TEMP, M_WAITOK);
 */

	/* Queue each request; failures only reduce the queued count. */
	cbptr = uap->acb_list;
	for(i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
		error = aio_aqueue(p, iocb, 0);
		if (error == 0)
			nentqueued++;
	}

	if (nentqueued == 0)
		return EIO;

	/* Partial submission is reported as EIO after any waiting. */
	runningcode = 0;
	if (nentqueued != nent)
		runningcode = EIO;

	if (uap->mode == LIO_WAIT) {
		while (1) {
			/* Scan; break out at the first request not yet done. */
			for(i = 0; i < uap->nent; i++) {
				int found;
				int jobref, command, status;

				iocb = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP)
					continue;

				/* status == -1 marks userland-handled entries. */
				status = fuword(&iocb->_aiocb_private.status);
				if (status == -1)
					continue;
				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				found = 0;
				for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
					cb;
					cb = TAILQ_NEXT(cb, plist)) {
					if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
						found++;
						break;
					}
				}
				if (found == 0)
					break;
			}

			/* All requests accounted for: done. */
			if (i == uap->nent) {
				return runningcode;
			}

			/* Sleep until a daemon completes one of our jobs. */
			aio_marksuspend(p, 0, 0, 1);
			error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);
			aio_marksuspend(p, 0, 0, 0);

			if (error == EINTR) {
				return EINTR;
			} else if (error == EWOULDBLOCK) {
				return EAGAIN;
			}

		}
	}

	return runningcode;
}
1172