xref: /freebsd/sys/kern/vfs_aio.c (revision dd85920a4fc0036a1d5b1432f1f67fe479a20010)
1ee877a35SJohn Dyson /*
2ee877a35SJohn Dyson  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3ee877a35SJohn Dyson  *
4ee877a35SJohn Dyson  * Redistribution and use in source and binary forms, with or without
5ee877a35SJohn Dyson  * modification, are permitted provided that the following conditions
6ee877a35SJohn Dyson  * are met:
7ee877a35SJohn Dyson  * 1. Redistributions of source code must retain the above copyright
8ee877a35SJohn Dyson  *    notice, this list of conditions and the following disclaimer.
9ee877a35SJohn Dyson  * 2. John S. Dyson's name may not be used to endorse or promote products
10ee877a35SJohn Dyson  *    derived from this software without specific prior written permission.
11ee877a35SJohn Dyson  *
12ee877a35SJohn Dyson  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13ee877a35SJohn Dyson  * bad that happens because of using this software isn't the responsibility
14ee877a35SJohn Dyson  * of the author.  This software is distributed AS-IS.
15ee877a35SJohn Dyson  *
16c3aac50fSPeter Wemm  * $FreeBSD$
17ee877a35SJohn Dyson  */
18ee877a35SJohn Dyson 
19ee877a35SJohn Dyson /*
208a6472b7SPeter Dufault  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21ee877a35SJohn Dyson  */
22ee877a35SJohn Dyson 
23ee877a35SJohn Dyson #include <sys/param.h>
24ee877a35SJohn Dyson #include <sys/systm.h>
25a5c9bce7SBruce Evans #include <sys/buf.h>
26ee877a35SJohn Dyson #include <sys/sysproto.h>
27ee877a35SJohn Dyson #include <sys/filedesc.h>
28ee877a35SJohn Dyson #include <sys/kernel.h>
29ee877a35SJohn Dyson #include <sys/fcntl.h>
30ee877a35SJohn Dyson #include <sys/file.h>
31fdebd4f0SBruce Evans #include <sys/lock.h>
32ee877a35SJohn Dyson #include <sys/unistd.h>
33ee877a35SJohn Dyson #include <sys/proc.h>
342d2f8ae7SBruce Evans #include <sys/resourcevar.h>
35ee877a35SJohn Dyson #include <sys/signalvar.h>
36bfbbc4aaSJason Evans #include <sys/protosw.h>
37bfbbc4aaSJason Evans #include <sys/socketvar.h>
38a624e84fSJohn Dyson #include <sys/sysctl.h>
39fd3bf775SJohn Dyson #include <sys/vnode.h>
40fd3bf775SJohn Dyson #include <sys/conf.h>
41ee877a35SJohn Dyson 
42ee877a35SJohn Dyson #include <vm/vm.h>
43ee877a35SJohn Dyson #include <vm/vm_extern.h>
442244ea07SJohn Dyson #include <vm/pmap.h>
452244ea07SJohn Dyson #include <vm/vm_map.h>
46fd3bf775SJohn Dyson #include <vm/vm_zone.h>
47ee877a35SJohn Dyson #include <sys/aio.h>
485aaef07cSJohn Dyson 
4908637435SBruce Evans #include <machine/limits.h>
50dd85920aSJason Evans #include "opt_vfs_aio.h"
51ee877a35SJohn Dyson 
/* Job reference id counter; aio_onceonly() starts it at 1. */
static	long jobrefid;

/*
 * Values of aiocblist.jobstate.  aio_free_entry() keys off the state to
 * decide which queue a job must be unlinked from before it is recycled.
 */
#define JOBST_NULL		0x0	/* entry is free; freeing it again panics */
#define	JOBST_JOBQPROC		0x1	/* on a specific daemon's jobtorun list */
#define JOBST_JOBQGLOBAL	0x2	/* on the global aio_jobs list */
#define JOBST_JOBRUNNING	0x3	/* in progress; aio_free_entry() waits for it */
#define JOBST_JOBFINISHED	0x4	/* on the owner's kaio_jobdone list */
#define	JOBST_JOBQBUF		0x5	/* buffer (physio) job, finished via aio_fphysio() */
#define	JOBST_JOBBFINISHED	0x6	/* on the owner's kaio_bufdone list */
612244ea07SJohn Dyson 
/*
 * Compile-time defaults for the AIO limits.  Each one is wrapped in #ifndef
 * so that a definition supplied earlier (e.g. by an included option header)
 * takes precedence.  They seed the static variables declared below.
 */

/* Default for max_aio_per_proc. */
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

/* Default for max_aio_queue_per_proc. */
#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
#endif

/* Default for max_aio_procs. */
#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

/* Default for max_queue_count. */
#ifndef MAX_AIO_QUEUE
#define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
#endif

/* Default for target_aio_procs. */
#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	4
#endif

/* Default for max_buf_aio. */
#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

/*
 * The two defaults below are expressed in clock ticks (multiples of hz) and
 * are therefore evaluated at run time, in aio_onceonly().
 */
#ifndef AIOD_TIMEOUT_DEFAULT
#define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif
932244ea07SJohn Dyson 
/* System-wide limits and counters. */
static int max_aio_procs = MAX_AIO_PROCS;
static int num_aio_procs = 0;	/* compared against target_aio_procs when spawning daemons */
static int target_aio_procs = TARGET_AIO_PROCS;
static int max_queue_count = MAX_AIO_QUEUE;
static int num_queue_count = 0;	/* system wide count of queued (non-buffer) jobs */
static int num_buf_aio = 0;	/* system wide count of buffer (physio) jobs */
static int num_aio_resv_start = 0;	/* NOTE(review): purpose not evident here -- confirm */
static int aiod_timeout;	/* ticks; set in aio_onceonly(), tsleep timeout in rundown */
static int aiod_lifetime;	/* ticks; set in aio_onceonly() */

/* Per-process limits, copied into each kaioinfo by aio_init_aioinfo(). */
static int max_aio_per_proc = MAX_AIO_PER_PROC;
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
static int max_buf_aio = MAX_BUF_AIO;
107a624e84fSJohn Dyson 
108a624e84fSJohn Dyson SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
109a624e84fSJohn Dyson 
110a624e84fSJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
111a624e84fSJohn Dyson 	CTLFLAG_RW, &max_aio_per_proc, 0, "");
112a624e84fSJohn Dyson 
113a624e84fSJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
114a624e84fSJohn Dyson 	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
115a624e84fSJohn Dyson 
116a624e84fSJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
117a624e84fSJohn Dyson 	CTLFLAG_RW, &max_aio_procs, 0, "");
118a624e84fSJohn Dyson 
119a624e84fSJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
120a624e84fSJohn Dyson 	CTLFLAG_RD, &num_aio_procs, 0, "");
121a624e84fSJohn Dyson 
122a624e84fSJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
123a624e84fSJohn Dyson 	CTLFLAG_RD, &num_queue_count, 0, "");
124a624e84fSJohn Dyson 
125a624e84fSJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
126a624e84fSJohn Dyson 	CTLFLAG_RW, &max_queue_count, 0, "");
127a624e84fSJohn Dyson 
128a624e84fSJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
129a624e84fSJohn Dyson 	CTLFLAG_RW, &target_aio_procs, 0, "");
130a624e84fSJohn Dyson 
13184af4da6SJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
13284af4da6SJohn Dyson 	CTLFLAG_RW, &max_buf_aio, 0, "");
133fd3bf775SJohn Dyson 
134fd3bf775SJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
135fd3bf775SJohn Dyson 	CTLFLAG_RD, &num_buf_aio, 0, "");
136fd3bf775SJohn Dyson 
13784af4da6SJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
13884af4da6SJohn Dyson 	CTLFLAG_RW, &aiod_lifetime, 0, "");
13984af4da6SJohn Dyson 
14084af4da6SJohn Dyson SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
14184af4da6SJohn Dyson 	CTLFLAG_RW, &aiod_timeout, 0, "");
14284af4da6SJohn Dyson 
/*
 * AIO process info.  One aioproclist is allocated per AIO daemon; it links
 * the daemon onto the global daemon lists and holds the jobs that were
 * handed to this particular daemon.
 */
#define AIOP_FREE	0x1			/* proc on free queue */
#define AIOP_SCHED	0x2			/* proc explicitly scheduled */

struct aioproclist {
	int aioprocflags;			/* AIO proc flags (AIOP_*) */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct proc *aioproc;			/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
};
1552244ea07SJohn Dyson 
/*
 * Data structure for lio signal management.  The counters track how many
 * jobs reference this liojob and how many of them have completed, split
 * into queued (daemon) jobs and buffer (physio) jobs.  aio_free_entry()
 * releases the liojob once both job counts have dropped to zero.
 */
struct aio_liojob {
	int	lioj_flags;			/* LIOJ_* flags below */
	int	lioj_buffer_count;		/* buffer jobs still referencing us */
	int	lioj_buffer_finished_count;	/* ... of which are done */
	int	lioj_queue_count;		/* queued jobs still referencing us */
	int	lioj_queue_finished_count;	/* ... of which are done */
	struct	sigevent lioj_signal;	/* signal on all I/O done */
	TAILQ_ENTRY	(aio_liojob) lioj_list;	/* linkage on kaio_liojoblist */
	struct	kaioinfo *lioj_ki;		/* kaioinfo associated with this liojob */
};
#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
17184af4da6SJohn Dyson 
/*
 * Per process aio data structure.  Allocated lazily by aio_init_aioinfo()
 * and released by aio_proc_rundown().
 */
struct kaioinfo {
	int	kaio_flags;		/* per process kaio flags (KAIO_*) */
	int	kaio_maxactive_count;	/* maximum number of AIOs */
	int	kaio_active_count;	/* number of currently used AIOs */
	int	kaio_qallowed_count;	/* maximum size of AIO queue */
	int	kaio_queue_count;	/* size of AIO queue */
	int	kaio_ballowed_count;	/* maximum number of buffers */
	int	kaio_queue_finished_count; /* number of daemon jobs finished */
	int	kaio_buffer_count;	/* number of physio buffers */
	int	kaio_buffer_finished_count; /* count of I/O done */
	struct 	proc *kaio_p;		/* process that uses this kaio block */
	TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD (,aiocblist)	kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist)	kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD (,aiocblist)	kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD (,aiocblist)	kaio_bufdone;	/* buffer done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
196fd3bf775SJohn Dyson 
/* Global daemon lists: idle daemons and daemons currently busy. */
static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;		/* Pool of free jobs */

/* Forward declarations of the file-local helpers. */
static void	aio_init_aioinfo(struct proc *p);
static void	aio_onceonly(void *);
static int	aio_free_entry(struct aiocblist *aiocbe);
static void	aio_process(struct aiocblist *aiocbe);
static int	aio_newproc(void);
static int	aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void	aio_physwakeup(struct buf *bp);
static int	aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void	aio_daemon(void *uproc);

/* Run aio_onceonly() once during system startup. */
SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

/* Allocation zones, created in aio_onceonly(). */
static vm_zone_t kaio_zone = 0, aiop_zone = 0, aiocb_zone = 0, aiol_zone = 0;
static vm_zone_t aiolio_zone = 0;
217fd3bf775SJohn Dyson 
218fd3bf775SJohn Dyson /*
2192244ea07SJohn Dyson  * Startup initialization
2202244ea07SJohn Dyson  */
2212244ea07SJohn Dyson void
222fd3bf775SJohn Dyson aio_onceonly(void *na)
223fd3bf775SJohn Dyson {
2242244ea07SJohn Dyson 	TAILQ_INIT(&aio_freeproc);
2252244ea07SJohn Dyson 	TAILQ_INIT(&aio_activeproc);
2262244ea07SJohn Dyson 	TAILQ_INIT(&aio_jobs);
227fd3bf775SJohn Dyson 	TAILQ_INIT(&aio_bufjobs);
2282244ea07SJohn Dyson 	TAILQ_INIT(&aio_freejobs);
229fd3bf775SJohn Dyson 	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
230fd3bf775SJohn Dyson 	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
231fd3bf775SJohn Dyson 	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
232fd3bf775SJohn Dyson 	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
233bfbbc4aaSJason Evans 	aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct
234bfbbc4aaSJason Evans 	    aio_liojob), 0, 0, 1);
23584af4da6SJohn Dyson 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
23684af4da6SJohn Dyson 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
237fd3bf775SJohn Dyson 	jobrefid = 1;
2382244ea07SJohn Dyson }
2392244ea07SJohn Dyson 
2402244ea07SJohn Dyson /*
241bfbbc4aaSJason Evans  * Init the per-process aioinfo structure.  The aioinfo limits are set
242bfbbc4aaSJason Evans  * per-process for user limit (resource) management.
2432244ea07SJohn Dyson  */
2442244ea07SJohn Dyson void
245fd3bf775SJohn Dyson aio_init_aioinfo(struct proc *p)
246fd3bf775SJohn Dyson {
2472244ea07SJohn Dyson 	struct kaioinfo *ki;
2482244ea07SJohn Dyson 	if (p->p_aioinfo == NULL) {
249fd3bf775SJohn Dyson 		ki = zalloc(kaio_zone);
2502244ea07SJohn Dyson 		p->p_aioinfo = ki;
25184af4da6SJohn Dyson 		ki->kaio_flags = 0;
252a624e84fSJohn Dyson 		ki->kaio_maxactive_count = max_aio_per_proc;
2532244ea07SJohn Dyson 		ki->kaio_active_count = 0;
254a624e84fSJohn Dyson 		ki->kaio_qallowed_count = max_aio_queue_per_proc;
2552244ea07SJohn Dyson 		ki->kaio_queue_count = 0;
25684af4da6SJohn Dyson 		ki->kaio_ballowed_count = max_buf_aio;
257fd3bf775SJohn Dyson 		ki->kaio_buffer_count = 0;
25884af4da6SJohn Dyson 		ki->kaio_buffer_finished_count = 0;
25984af4da6SJohn Dyson 		ki->kaio_p = p;
2602244ea07SJohn Dyson 		TAILQ_INIT(&ki->kaio_jobdone);
2612244ea07SJohn Dyson 		TAILQ_INIT(&ki->kaio_jobqueue);
262fd3bf775SJohn Dyson 		TAILQ_INIT(&ki->kaio_bufdone);
263fd3bf775SJohn Dyson 		TAILQ_INIT(&ki->kaio_bufqueue);
26484af4da6SJohn Dyson 		TAILQ_INIT(&ki->kaio_liojoblist);
265bfbbc4aaSJason Evans 		TAILQ_INIT(&ki->kaio_sockqueue);
2662244ea07SJohn Dyson 	}
267bfbbc4aaSJason Evans 
268bfbbc4aaSJason Evans 	while (num_aio_procs < target_aio_procs)
269bfbbc4aaSJason Evans 		aio_newproc();
2702244ea07SJohn Dyson }
2712244ea07SJohn Dyson 
2722244ea07SJohn Dyson /*
273bfbbc4aaSJason Evans  * Free a job entry.  Wait for completion if it is currently active, but don't
274bfbbc4aaSJason Evans  * delay forever.  If we delay, we return a flag that says that we have to
275bfbbc4aaSJason Evans  * restart the queue scan.
2762244ea07SJohn Dyson  */
2772244ea07SJohn Dyson int
278fd3bf775SJohn Dyson aio_free_entry(struct aiocblist *aiocbe)
279fd3bf775SJohn Dyson {
2802244ea07SJohn Dyson 	struct kaioinfo *ki;
2812244ea07SJohn Dyson 	struct aioproclist *aiop;
28284af4da6SJohn Dyson 	struct aio_liojob *lj;
2832244ea07SJohn Dyson 	struct proc *p;
284fd3bf775SJohn Dyson 	int error;
28511783b14SJohn Dyson 	int s;
2862244ea07SJohn Dyson 
2872244ea07SJohn Dyson 	if (aiocbe->jobstate == JOBST_NULL)
2882244ea07SJohn Dyson 		panic("aio_free_entry: freeing already free job");
2892244ea07SJohn Dyson 
2902244ea07SJohn Dyson 	p = aiocbe->userproc;
2912244ea07SJohn Dyson 	ki = p->p_aioinfo;
29284af4da6SJohn Dyson 	lj = aiocbe->lio;
2932244ea07SJohn Dyson 	if (ki == NULL)
2942244ea07SJohn Dyson 		panic("aio_free_entry: missing p->p_aioinfo");
2952244ea07SJohn Dyson 
2962244ea07SJohn Dyson 	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
2972244ea07SJohn Dyson 		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
2982244ea07SJohn Dyson 			return 0;
2992244ea07SJohn Dyson 		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
300a624e84fSJohn Dyson 		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
3012244ea07SJohn Dyson 	}
3022244ea07SJohn Dyson 	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
3032244ea07SJohn Dyson 
304fd3bf775SJohn Dyson 	if (aiocbe->bp == NULL) {
3052244ea07SJohn Dyson 		if (ki->kaio_queue_count <= 0)
3062244ea07SJohn Dyson 			panic("aio_free_entry: process queue size <= 0");
3072244ea07SJohn Dyson 		if (num_queue_count <= 0)
3082244ea07SJohn Dyson 			panic("aio_free_entry: system wide queue size <= 0");
3092244ea07SJohn Dyson 
31084af4da6SJohn Dyson 		if (lj) {
31184af4da6SJohn Dyson 			lj->lioj_queue_count--;
31284af4da6SJohn Dyson 			if (aiocbe->jobflags & AIOCBLIST_DONE)
31384af4da6SJohn Dyson 				lj->lioj_queue_finished_count--;
31484af4da6SJohn Dyson 		}
31584af4da6SJohn Dyson 		ki->kaio_queue_count--;
31684af4da6SJohn Dyson 		if (aiocbe->jobflags & AIOCBLIST_DONE)
31784af4da6SJohn Dyson 			ki->kaio_queue_finished_count--;
31884af4da6SJohn Dyson 		num_queue_count--;
319fd3bf775SJohn Dyson 	} else {
32084af4da6SJohn Dyson 		if (lj) {
32184af4da6SJohn Dyson 			lj->lioj_buffer_count--;
32284af4da6SJohn Dyson 			if (aiocbe->jobflags & AIOCBLIST_DONE)
32384af4da6SJohn Dyson 				lj->lioj_buffer_finished_count--;
32484af4da6SJohn Dyson 		}
32584af4da6SJohn Dyson 		if (aiocbe->jobflags & AIOCBLIST_DONE)
32684af4da6SJohn Dyson 			ki->kaio_buffer_finished_count--;
32784af4da6SJohn Dyson 		ki->kaio_buffer_count--;
32884af4da6SJohn Dyson 		num_buf_aio--;
329fd3bf775SJohn Dyson 	}
330fd3bf775SJohn Dyson 
331bfbbc4aaSJason Evans 	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
332bfbbc4aaSJason Evans 	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
333fd3bf775SJohn Dyson 		ki->kaio_flags &= ~KAIO_WAKEUP;
334fd3bf775SJohn Dyson 		wakeup(p);
335fd3bf775SJohn Dyson 	}
336fd3bf775SJohn Dyson 
337fd3bf775SJohn Dyson 	if (aiocbe->jobstate == JOBST_JOBQBUF) {
338fd3bf775SJohn Dyson 		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
339fd3bf775SJohn Dyson 			return error;
340fd3bf775SJohn Dyson 		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
341fd3bf775SJohn Dyson 			panic("aio_free_entry: invalid physio finish-up state");
34211783b14SJohn Dyson 		s = splbio();
343fd3bf775SJohn Dyson 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
34411783b14SJohn Dyson 		splx(s);
345fd3bf775SJohn Dyson 	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
3462244ea07SJohn Dyson 		aiop = aiocbe->jobaioproc;
3472244ea07SJohn Dyson 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
348bfbbc4aaSJason Evans 	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL)
3492244ea07SJohn Dyson 		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
350bfbbc4aaSJason Evans 	else if (aiocbe->jobstate == JOBST_JOBFINISHED)
3512244ea07SJohn Dyson 		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
352bfbbc4aaSJason Evans 	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
35311783b14SJohn Dyson 		s = splbio();
354fd3bf775SJohn Dyson 		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
35511783b14SJohn Dyson 		splx(s);
35684af4da6SJohn Dyson 		if (aiocbe->bp) {
35784af4da6SJohn Dyson 			vunmapbuf(aiocbe->bp);
3581c7c3c6aSMatthew Dillon 			relpbuf(aiocbe->bp, NULL);
35984af4da6SJohn Dyson 			aiocbe->bp = NULL;
36084af4da6SJohn Dyson 		}
36184af4da6SJohn Dyson 	}
36284af4da6SJohn Dyson 	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
36384af4da6SJohn Dyson 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
36484af4da6SJohn Dyson 		zfree(aiolio_zone, lj);
3652244ea07SJohn Dyson 	}
3662244ea07SJohn Dyson 	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
3672244ea07SJohn Dyson 	aiocbe->jobstate = JOBST_NULL;
3682244ea07SJohn Dyson 	return 0;
3692244ea07SJohn Dyson }
3702244ea07SJohn Dyson 
3712244ea07SJohn Dyson /*
3722244ea07SJohn Dyson  * Rundown the jobs for a given process.
3732244ea07SJohn Dyson  */
3742244ea07SJohn Dyson void
375fd3bf775SJohn Dyson aio_proc_rundown(struct proc *p)
376fd3bf775SJohn Dyson {
37784af4da6SJohn Dyson 	int s;
3782244ea07SJohn Dyson 	struct kaioinfo *ki;
37984af4da6SJohn Dyson 	struct aio_liojob *lj, *ljn;
3802244ea07SJohn Dyson 	struct aiocblist *aiocbe, *aiocbn;
381bfbbc4aaSJason Evans 	struct file *fp;
382bfbbc4aaSJason Evans 	struct filedesc *fdp;
383bfbbc4aaSJason Evans 	struct socket *so;
3842244ea07SJohn Dyson 
3852244ea07SJohn Dyson 	ki = p->p_aioinfo;
3862244ea07SJohn Dyson 	if (ki == NULL)
3872244ea07SJohn Dyson 		return;
3882244ea07SJohn Dyson 
38984af4da6SJohn Dyson 	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
390bfbbc4aaSJason Evans 	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
391bfbbc4aaSJason Evans 	    ki->kaio_buffer_finished_count)) {
392fd3bf775SJohn Dyson 		ki->kaio_flags |= KAIO_RUNDOWN;
39384af4da6SJohn Dyson 		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
394a624e84fSJohn Dyson 			break;
395a624e84fSJohn Dyson 	}
396a624e84fSJohn Dyson 
397bfbbc4aaSJason Evans 	/*
398bfbbc4aaSJason Evans 	 * Move any aio ops that are waiting on socket I/O to the normal job
399bfbbc4aaSJason Evans 	 * queues so they are cleaned up with any others.
400bfbbc4aaSJason Evans 	 */
401bfbbc4aaSJason Evans 	fdp = p->p_fd;
402bfbbc4aaSJason Evans 
403bfbbc4aaSJason Evans 	s = splnet();
404bfbbc4aaSJason Evans 	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
405bfbbc4aaSJason Evans 	    aiocbn) {
406bfbbc4aaSJason Evans 		aiocbn = TAILQ_NEXT(aiocbe, plist);
407bfbbc4aaSJason Evans 		fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];
408bfbbc4aaSJason Evans 
409bfbbc4aaSJason Evans 		/*
410bfbbc4aaSJason Evans 		 * Under some circumstances, the aio_fildes and the file
411bfbbc4aaSJason Evans 		 * structure don't match.  This would leave aiocbe's in the
412bfbbc4aaSJason Evans 		 * TAILQ associated with the socket and cause a panic later.
413bfbbc4aaSJason Evans 		 *
414bfbbc4aaSJason Evans 		 * Detect and fix.
415bfbbc4aaSJason Evans 		 */
416bfbbc4aaSJason Evans 		if ((fp == NULL) || (fp != aiocbe->fd_file))
417bfbbc4aaSJason Evans 			fp = aiocbe->fd_file;
418bfbbc4aaSJason Evans 		if (fp) {
419bfbbc4aaSJason Evans 			so = (struct socket *)fp->f_data;
420bfbbc4aaSJason Evans 			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
421bfbbc4aaSJason Evans 			if (TAILQ_EMPTY(&so->so_aiojobq)) {
422bfbbc4aaSJason Evans 				so->so_snd.sb_flags &= ~SB_AIO;
423bfbbc4aaSJason Evans 				so->so_rcv.sb_flags &= ~SB_AIO;
424bfbbc4aaSJason Evans 			}
425bfbbc4aaSJason Evans 		}
426bfbbc4aaSJason Evans 		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
427bfbbc4aaSJason Evans 		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
428bfbbc4aaSJason Evans 		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
429bfbbc4aaSJason Evans 	}
430bfbbc4aaSJason Evans 	splx(s);
431bfbbc4aaSJason Evans 
4322244ea07SJohn Dyson restart1:
433bfbbc4aaSJason Evans 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
4342244ea07SJohn Dyson 		aiocbn = TAILQ_NEXT(aiocbe, plist);
4352244ea07SJohn Dyson 		if (aio_free_entry(aiocbe))
4362244ea07SJohn Dyson 			goto restart1;
4372244ea07SJohn Dyson 	}
4382244ea07SJohn Dyson 
4392244ea07SJohn Dyson restart2:
440bfbbc4aaSJason Evans 	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
441bfbbc4aaSJason Evans 	    aiocbn) {
4422244ea07SJohn Dyson 		aiocbn = TAILQ_NEXT(aiocbe, plist);
4432244ea07SJohn Dyson 		if (aio_free_entry(aiocbe))
4442244ea07SJohn Dyson 			goto restart2;
4452244ea07SJohn Dyson 	}
44684af4da6SJohn Dyson 
447c5efdcbdSJohn Dyson /*
448bfbbc4aaSJason Evans  * Note the use of lots of splbio here, trying to avoid splbio for long chains
449bfbbc4aaSJason Evans  * of I/O.  Probably unnecessary.
450c5efdcbdSJohn Dyson  */
45184af4da6SJohn Dyson restart3:
45284af4da6SJohn Dyson 	s = splbio();
45384af4da6SJohn Dyson 	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
45484af4da6SJohn Dyson 		ki->kaio_flags |= KAIO_WAKEUP;
45584af4da6SJohn Dyson 		tsleep(p, PRIBIO, "aioprn", 0);
45684af4da6SJohn Dyson 		splx(s);
45784af4da6SJohn Dyson 		goto restart3;
45884af4da6SJohn Dyson 	}
459c5efdcbdSJohn Dyson 	splx(s);
46084af4da6SJohn Dyson 
46184af4da6SJohn Dyson restart4:
46284af4da6SJohn Dyson 	s = splbio();
463bfbbc4aaSJason Evans 	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
46484af4da6SJohn Dyson 		aiocbn = TAILQ_NEXT(aiocbe, plist);
46584af4da6SJohn Dyson 		if (aio_free_entry(aiocbe)) {
46684af4da6SJohn Dyson 			splx(s);
46784af4da6SJohn Dyson 			goto restart4;
46884af4da6SJohn Dyson 		}
46984af4da6SJohn Dyson 	}
47084af4da6SJohn Dyson 	splx(s);
47184af4da6SJohn Dyson 
472bfbbc4aaSJason Evans 	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
47384af4da6SJohn Dyson 		ljn = TAILQ_NEXT(lj, lioj_list);
474bfbbc4aaSJason Evans 		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
475bfbbc4aaSJason Evans 		    0)) {
47684af4da6SJohn Dyson 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
47784af4da6SJohn Dyson 			zfree(aiolio_zone, lj);
478f4f0ecefSJohn Dyson 		} else {
479bfbbc4aaSJason Evans #ifdef DIAGNOSTIC
480bfbbc4aaSJason Evans 			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
481bfbbc4aaSJason Evans 			    "QF:%d\n", lj->lioj_buffer_count,
482bfbbc4aaSJason Evans 			    lj->lioj_buffer_finished_count,
483bfbbc4aaSJason Evans 			    lj->lioj_queue_count,
484bfbbc4aaSJason Evans 			    lj->lioj_queue_finished_count);
48511783b14SJohn Dyson #endif
48684af4da6SJohn Dyson 		}
487f4f0ecefSJohn Dyson 	}
48884af4da6SJohn Dyson 
489fd3bf775SJohn Dyson 	zfree(kaio_zone, ki);
490a624e84fSJohn Dyson 	p->p_aioinfo = NULL;
4912244ea07SJohn Dyson }
4922244ea07SJohn Dyson 
4932244ea07SJohn Dyson /*
494bfbbc4aaSJason Evans  * Select a job to run (called by an AIO daemon).
4952244ea07SJohn Dyson  */
4962244ea07SJohn Dyson static struct aiocblist *
497fd3bf775SJohn Dyson aio_selectjob(struct aioproclist *aiop)
498fd3bf775SJohn Dyson {
499bfbbc4aaSJason Evans 	int s;
5002244ea07SJohn Dyson 	struct aiocblist *aiocbe;
501bfbbc4aaSJason Evans 	struct kaioinfo *ki;
502bfbbc4aaSJason Evans 	struct proc *userp;
5032244ea07SJohn Dyson 
5042244ea07SJohn Dyson 	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
5052244ea07SJohn Dyson 	if (aiocbe) {
5062244ea07SJohn Dyson 		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
5072244ea07SJohn Dyson 		return aiocbe;
5082244ea07SJohn Dyson 	}
5092244ea07SJohn Dyson 
510bfbbc4aaSJason Evans 	s = splnet();
511bfbbc4aaSJason Evans 	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
512bfbbc4aaSJason Evans 	    TAILQ_NEXT(aiocbe, list)) {
5132244ea07SJohn Dyson 		userp = aiocbe->userproc;
5142244ea07SJohn Dyson 		ki = userp->p_aioinfo;
5152244ea07SJohn Dyson 
5162244ea07SJohn Dyson 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
5172244ea07SJohn Dyson 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
518bfbbc4aaSJason Evans 			splx(s);
5192244ea07SJohn Dyson 			return aiocbe;
5202244ea07SJohn Dyson 		}
5212244ea07SJohn Dyson 	}
522bfbbc4aaSJason Evans 	splx(s);
5232244ea07SJohn Dyson 
5242244ea07SJohn Dyson 	return NULL;
5252244ea07SJohn Dyson }
5262244ea07SJohn Dyson 
5272244ea07SJohn Dyson /*
528bfbbc4aaSJason Evans  * The AIO processing activity.  This is the code that does the I/O request for
529bfbbc4aaSJason Evans  * the non-physio version of the operations.  The normal vn operations are used,
530bfbbc4aaSJason Evans  * and this code should work in all instances for every type of file, including
531bfbbc4aaSJason Evans  * pipes, sockets, fifos, and regular files.
5322244ea07SJohn Dyson  */
5332244ea07SJohn Dyson void
534fd3bf775SJohn Dyson aio_process(struct aiocblist *aiocbe)
535fd3bf775SJohn Dyson {
5362244ea07SJohn Dyson 	struct filedesc *fdp;
537fd3bf775SJohn Dyson 	struct proc *userp, *mycp;
5382244ea07SJohn Dyson 	struct aiocb *cb;
5392244ea07SJohn Dyson 	struct file *fp;
5402244ea07SJohn Dyson 	struct uio auio;
5412244ea07SJohn Dyson 	struct iovec aiov;
5422244ea07SJohn Dyson 	unsigned int fd;
5432244ea07SJohn Dyson 	int cnt;
5442244ea07SJohn Dyson 	int error;
545a624e84fSJohn Dyson 	off_t offset;
546fd3bf775SJohn Dyson 	int oublock_st, oublock_end;
547fd3bf775SJohn Dyson 	int inblock_st, inblock_end;
5482244ea07SJohn Dyson 
5492244ea07SJohn Dyson 	userp = aiocbe->userproc;
5502244ea07SJohn Dyson 	cb = &aiocbe->uaiocb;
5512244ea07SJohn Dyson 
552fd3bf775SJohn Dyson 	mycp = curproc;
553fd3bf775SJohn Dyson 
554fd3bf775SJohn Dyson 	fdp = mycp->p_fd;
5552244ea07SJohn Dyson 	fd = cb->aio_fildes;
5562244ea07SJohn Dyson 	fp = fdp->fd_ofiles[fd];
5572244ea07SJohn Dyson 
558bfbbc4aaSJason Evans 	if ((fp == NULL) || (fp != aiocbe->fd_file)) {
559bfbbc4aaSJason Evans 		cb->_aiocb_private.error = EBADF;
560bfbbc4aaSJason Evans 		cb->_aiocb_private.status = -1;
561bfbbc4aaSJason Evans 		return;
562bfbbc4aaSJason Evans 	}
563bfbbc4aaSJason Evans 
56464889941SJohn Dyson 	aiov.iov_base = (void *)cb->aio_buf;
5652244ea07SJohn Dyson 	aiov.iov_len = cb->aio_nbytes;
5662244ea07SJohn Dyson 
5672244ea07SJohn Dyson 	auio.uio_iov = &aiov;
5682244ea07SJohn Dyson 	auio.uio_iovcnt = 1;
569a624e84fSJohn Dyson 	auio.uio_offset = offset = cb->aio_offset;
5702244ea07SJohn Dyson 	auio.uio_resid = cb->aio_nbytes;
5712244ea07SJohn Dyson 	cnt = cb->aio_nbytes;
5722244ea07SJohn Dyson 	auio.uio_segflg = UIO_USERSPACE;
573fd3bf775SJohn Dyson 	auio.uio_procp = mycp;
5742244ea07SJohn Dyson 
575fd3bf775SJohn Dyson 	inblock_st = mycp->p_stats->p_ru.ru_inblock;
576fd3bf775SJohn Dyson 	oublock_st = mycp->p_stats->p_ru.ru_oublock;
5772244ea07SJohn Dyson 	if (cb->aio_lio_opcode == LIO_READ) {
5782244ea07SJohn Dyson 		auio.uio_rw = UIO_READ;
57913ccadd4SBrian Feldman 		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
5802244ea07SJohn Dyson 	} else {
5812244ea07SJohn Dyson 		auio.uio_rw = UIO_WRITE;
58213ccadd4SBrian Feldman 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
5832244ea07SJohn Dyson 	}
584fd3bf775SJohn Dyson 	inblock_end = mycp->p_stats->p_ru.ru_inblock;
585fd3bf775SJohn Dyson 	oublock_end = mycp->p_stats->p_ru.ru_oublock;
586fd3bf775SJohn Dyson 
587fd3bf775SJohn Dyson 	aiocbe->inputcharge = inblock_end - inblock_st;
588fd3bf775SJohn Dyson 	aiocbe->outputcharge = oublock_end - oublock_st;
5892244ea07SJohn Dyson 
590bfbbc4aaSJason Evans 	if ((error) && (auio.uio_resid != cnt)) {
5912244ea07SJohn Dyson 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
5922244ea07SJohn Dyson 			error = 0;
5932244ea07SJohn Dyson 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
5942244ea07SJohn Dyson 			psignal(userp, SIGPIPE);
5952244ea07SJohn Dyson 	}
5962244ea07SJohn Dyson 
5972244ea07SJohn Dyson 	cnt -= auio.uio_resid;
5982244ea07SJohn Dyson 	cb->_aiocb_private.error = error;
5992244ea07SJohn Dyson 	cb->_aiocb_private.status = cnt;
6002244ea07SJohn Dyson 
6012244ea07SJohn Dyson 	return;
6022244ea07SJohn Dyson }
6032244ea07SJohn Dyson 
6042244ea07SJohn Dyson /*
60584af4da6SJohn Dyson  * The AIO daemon, most of the actual work is done in aio_process,
60684af4da6SJohn Dyson  * but the setup (and address space mgmt) is done in this routine.
 *
 * The uproc argument is not referenced in the body.  The routine never
 * returns: after the one-time setup it loops forever, running every job
 * that aio_selectjob() hands it and then parking itself on the free list.
 * The only way out is exit1(), taken when a sleep on the free list ends
 * with a non-zero tsleep() result, no work is queued, the daemon is still
 * marked AIOP_FREE and num_aio_procs exceeds target_aio_procs.
6072244ea07SJohn Dyson  */
6082244ea07SJohn Dyson static void
6099c8b8baaSPeter Wemm aio_daemon(void *uproc)
6102244ea07SJohn Dyson {
61184af4da6SJohn Dyson 	int s;
612bfbbc4aaSJason Evans 	struct aio_liojob *lj;
613bfbbc4aaSJason Evans 	struct aiocb *cb;
614bfbbc4aaSJason Evans 	struct aiocblist *aiocbe;
6152244ea07SJohn Dyson 	struct aioproclist *aiop;
616bfbbc4aaSJason Evans 	struct kaioinfo *ki;
617bfbbc4aaSJason Evans 	struct proc *curcp, *mycp, *userp;
618bfbbc4aaSJason Evans 	struct vmspace *myvm, *tmpvm;
6192244ea07SJohn Dyson 
6202244ea07SJohn Dyson 	/*
621fd3bf775SJohn Dyson 	 * Local copies of curproc (mycp) and vmspace (myvm)
6222244ea07SJohn Dyson 	 */
623fd3bf775SJohn Dyson 	mycp = curproc;
624fd3bf775SJohn Dyson 	myvm = mycp->p_vmspace;
625fd3bf775SJohn Dyson 
	/* Release this process's text vnode reference, if it holds one. */
626fd3bf775SJohn Dyson 	if (mycp->p_textvp) {
627fd3bf775SJohn Dyson 		vrele(mycp->p_textvp);
628fd3bf775SJohn Dyson 		mycp->p_textvp = NULL;
629fd3bf775SJohn Dyson 	}
630fd3bf775SJohn Dyson 
631fd3bf775SJohn Dyson 	/*
632bfbbc4aaSJason Evans 	 * Allocate and ready the aio control info.  There is one aiop structure
633bfbbc4aaSJason Evans 	 * per daemon.
634fd3bf775SJohn Dyson 	 */
635fd3bf775SJohn Dyson 	aiop = zalloc(aiop_zone);
636fd3bf775SJohn Dyson 	aiop->aioproc = mycp;
6372244ea07SJohn Dyson 	aiop->aioprocflags |= AIOP_FREE;
6382244ea07SJohn Dyson 	TAILQ_INIT(&aiop->jobtorun);
6392244ea07SJohn Dyson 
640bfbbc4aaSJason Evans 	s = splnet();
641bfbbc4aaSJason Evans 
6422244ea07SJohn Dyson 	/*
643bfbbc4aaSJason Evans 	 * Place thread (lightweight process) onto the AIO free thread list.
	 * If the list was empty, first wake anyone sleeping on it.
6442244ea07SJohn Dyson 	 */
645fd3bf775SJohn Dyson 	if (TAILQ_EMPTY(&aio_freeproc))
646fd3bf775SJohn Dyson 		wakeup(&aio_freeproc);
647fd3bf775SJohn Dyson 	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
6482244ea07SJohn Dyson 
649bfbbc4aaSJason Evans 	splx(s);
650bfbbc4aaSJason Evans 
651bfbbc4aaSJason Evans 	/* Make up a name for the daemon. */
652fd3bf775SJohn Dyson 	strcpy(mycp->p_comm, "aiod");
6532244ea07SJohn Dyson 
6542244ea07SJohn Dyson 	/*
655fd3bf775SJohn Dyson 	 * Get rid of our current filedescriptors.  AIOD's don't need any
656fd3bf775SJohn Dyson 	 * filedescriptors, except as temporarily inherited from the client.
657bfbbc4aaSJason Evans 	 * Credentials are also cloned, and made equivalent to "root".
6582244ea07SJohn Dyson 	 */
659fd3bf775SJohn Dyson 	fdfree(mycp);
660fd3bf775SJohn Dyson 	mycp->p_fd = NULL;
661fd3bf775SJohn Dyson 	mycp->p_ucred = crcopy(mycp->p_ucred);
662fd3bf775SJohn Dyson 	mycp->p_ucred->cr_uid = 0;
663fd3bf775SJohn Dyson 	mycp->p_ucred->cr_ngroups = 1;
664fd3bf775SJohn Dyson 	mycp->p_ucred->cr_groups[0] = 1;
665fd3bf775SJohn Dyson 
666bfbbc4aaSJason Evans 	/* The daemon resides in its own pgrp. */
667fd3bf775SJohn Dyson 	enterpgrp(mycp, mycp->p_pid, 1);
668fd3bf775SJohn Dyson 
669bfbbc4aaSJason Evans 	/* Mark special process type. */
670fd3bf775SJohn Dyson 	mycp->p_flag |= P_SYSTEM | P_KTHREADP;
6712244ea07SJohn Dyson 
672fd3bf775SJohn Dyson 	/*
673fd3bf775SJohn Dyson 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
674fd3bf775SJohn Dyson 	 * creating too many daemons.)
675fd3bf775SJohn Dyson 	 */
676fd3bf775SJohn Dyson 	wakeup(mycp);
6772244ea07SJohn Dyson 
678bfbbc4aaSJason Evans 	for (;;) {
679fd3bf775SJohn Dyson 		/*
680fd3bf775SJohn Dyson 		 * curcp is the current daemon process context.
681fd3bf775SJohn Dyson 		 * userp is the current user process context.
682fd3bf775SJohn Dyson 		 */
683fd3bf775SJohn Dyson 		curcp = mycp;
684c4860686SJohn Dyson 
685fd3bf775SJohn Dyson 		/*
686fd3bf775SJohn Dyson 		 * Take daemon off of free queue
687fd3bf775SJohn Dyson 		 */
6882244ea07SJohn Dyson 		if (aiop->aioprocflags & AIOP_FREE) {
689bfbbc4aaSJason Evans 			s = splnet();
6902244ea07SJohn Dyson 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
6912244ea07SJohn Dyson 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
6922244ea07SJohn Dyson 			aiop->aioprocflags &= ~AIOP_FREE;
693bfbbc4aaSJason Evans 			splx(s);
6942244ea07SJohn Dyson 		}
695fd3bf775SJohn Dyson 		aiop->aioprocflags &= ~AIOP_SCHED;
6962244ea07SJohn Dyson 
697fd3bf775SJohn Dyson 		/*
698bfbbc4aaSJason Evans 		 * Check for jobs.
699fd3bf775SJohn Dyson 		 */
700d254af07SMatthew Dillon 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
7012244ea07SJohn Dyson 			cb = &aiocbe->uaiocb;
7022244ea07SJohn Dyson 			userp = aiocbe->userproc;
7032244ea07SJohn Dyson 
7042244ea07SJohn Dyson 			aiocbe->jobstate = JOBST_JOBRUNNING;
705fd3bf775SJohn Dyson 
706fd3bf775SJohn Dyson 			/*
707bfbbc4aaSJason Evans 			 * Connect to process address space for user program.
708fd3bf775SJohn Dyson 			 */
709fd3bf775SJohn Dyson 			if (userp != curcp) {
710fd3bf775SJohn Dyson 				/*
711bfbbc4aaSJason Evans 				 * Save the current address space that we are
712bfbbc4aaSJason Evans 				 * connected to.
713fd3bf775SJohn Dyson 				 */
714fd3bf775SJohn Dyson 				tmpvm = mycp->p_vmspace;
715bfbbc4aaSJason Evans 
716fd3bf775SJohn Dyson 				/*
717bfbbc4aaSJason Evans 				 * Point to the new user address space, and
718bfbbc4aaSJason Evans 				 * refer to it.
719fd3bf775SJohn Dyson 				 */
720fd3bf775SJohn Dyson 				mycp->p_vmspace = userp->p_vmspace;
72184af4da6SJohn Dyson 				mycp->p_vmspace->vm_refcnt++;
722bfbbc4aaSJason Evans 
723bfbbc4aaSJason Evans 				/* Activate the new mapping. */
724fd3bf775SJohn Dyson 				pmap_activate(mycp);
725bfbbc4aaSJason Evans 
726fd3bf775SJohn Dyson 				/*
727bfbbc4aaSJason Evans 				 * If the old address space wasn't the daemon's
728bfbbc4aaSJason Evans 				 * own address space, then we need to remove the
729bfbbc4aaSJason Evans 				 * daemon's reference from the other process
730bfbbc4aaSJason Evans 				 * that it was acting on behalf of.
731fd3bf775SJohn Dyson 				 */
7322244ea07SJohn Dyson 				if (tmpvm != myvm) {
7332244ea07SJohn Dyson 					vmspace_free(tmpvm);
7342244ea07SJohn Dyson 				}
735bfbbc4aaSJason Evans 
736fd3bf775SJohn Dyson 				/*
737bfbbc4aaSJason Evans 				 * Disassociate from previous clients file
738bfbbc4aaSJason Evans 				 * descriptors, and associate to the new clients
739bfbbc4aaSJason Evans 				 * descriptors.  Note that the daemon doesn't
740bfbbc4aaSJason Evans 				 * need to worry about its original descriptors,
741bfbbc4aaSJason Evans 				 * because they were originally freed.
742fd3bf775SJohn Dyson 				 */
743fd3bf775SJohn Dyson 				if (mycp->p_fd)
744fd3bf775SJohn Dyson 					fdfree(mycp);
745fd3bf775SJohn Dyson 				mycp->p_fd = fdshare(userp);
746fd3bf775SJohn Dyson 				curcp = userp;
7472244ea07SJohn Dyson 			}
7482244ea07SJohn Dyson 
749fd3bf775SJohn Dyson 			ki = userp->p_aioinfo;
75084af4da6SJohn Dyson 			lj = aiocbe->lio;
75184af4da6SJohn Dyson 
752bfbbc4aaSJason Evans 			/* Account for currently active jobs. */
7532244ea07SJohn Dyson 			ki->kaio_active_count++;
75411783b14SJohn Dyson 
755bfbbc4aaSJason Evans 			/* Do the I/O function. */
7562244ea07SJohn Dyson 			aiocbe->jobaioproc = aiop;
7572244ea07SJohn Dyson 			aio_process(aiocbe);
75884af4da6SJohn Dyson 
759bfbbc4aaSJason Evans 			/* Decrement the active job count. */
76084af4da6SJohn Dyson 			ki->kaio_active_count--;
76184af4da6SJohn Dyson 
76284af4da6SJohn Dyson 			/*
763bfbbc4aaSJason Evans 			 * Increment the completion count for wakeup/signal
764bfbbc4aaSJason Evans 			 * comparisons.
			 * The owning process is woken if it asked for a
			 * wakeup (KAIO_WAKEUP), or if it is in rundown and
			 * this was its last active job.
76584af4da6SJohn Dyson 			 */
76684af4da6SJohn Dyson 			aiocbe->jobflags |= AIOCBLIST_DONE;
76784af4da6SJohn Dyson 			ki->kaio_queue_finished_count++;
768bfbbc4aaSJason Evans 			if (lj)
76984af4da6SJohn Dyson 				lj->lioj_queue_finished_count++;
770bfbbc4aaSJason Evans 			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
771bfbbc4aaSJason Evans 			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
772fd3bf775SJohn Dyson 				ki->kaio_flags &= ~KAIO_WAKEUP;
773fd3bf775SJohn Dyson 				wakeup(userp);
774fd3bf775SJohn Dyson 			}
7752244ea07SJohn Dyson 
			/*
			 * For a list-I/O job that requested a signal which has
			 * not been posted yet: post it once both the queued
			 * and the buffered counts show every job finished.
			 */
77684af4da6SJohn Dyson 			s = splbio();
777bfbbc4aaSJason Evans 			if (lj && (lj->lioj_flags &
778bfbbc4aaSJason Evans 			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
779bfbbc4aaSJason Evans 				if ((lj->lioj_queue_finished_count ==
780bfbbc4aaSJason Evans 				    lj->lioj_queue_count) &&
781bfbbc4aaSJason Evans 				    (lj->lioj_buffer_finished_count ==
782bfbbc4aaSJason Evans 				    lj->lioj_buffer_count)) {
783bfbbc4aaSJason Evans 						psignal(userp,
784bfbbc4aaSJason Evans 						    lj->lioj_signal.sigev_signo);
785bfbbc4aaSJason Evans 						lj->lioj_flags |=
786bfbbc4aaSJason Evans 						    LIOJ_SIGNAL_POSTED;
78784af4da6SJohn Dyson 				}
78884af4da6SJohn Dyson 			}
78984af4da6SJohn Dyson 			splx(s);
79084af4da6SJohn Dyson 
7912244ea07SJohn Dyson 			aiocbe->jobstate = JOBST_JOBFINISHED;
7922244ea07SJohn Dyson 
793fd3bf775SJohn Dyson 			/*
794bfbbc4aaSJason Evans 			 * If the I/O request should be automatically rundown,
795bfbbc4aaSJason Evans 			 * do the needed cleanup.  Otherwise, place the queue
796bfbbc4aaSJason Evans 			 * entry for the just finished I/O request into the done
797bfbbc4aaSJason Evans 			 * queue for the associated client.
798fd3bf775SJohn Dyson 			 */
799bfbbc4aaSJason Evans 			s = splnet();
8002244ea07SJohn Dyson 			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
8012244ea07SJohn Dyson 				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
8022244ea07SJohn Dyson 				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
8032244ea07SJohn Dyson 			} else {
804bfbbc4aaSJason Evans 				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
805bfbbc4aaSJason Evans 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
806bfbbc4aaSJason Evans 				    plist);
8072244ea07SJohn Dyson 			}
808bfbbc4aaSJason Evans 			splx(s);
8092244ea07SJohn Dyson 
			/* Wake anyone sleeping on this job entry, then clear the request. */
8102244ea07SJohn Dyson 			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
8112244ea07SJohn Dyson 				wakeup(aiocbe);
8122244ea07SJohn Dyson 				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
8132244ea07SJohn Dyson 			}
8142244ea07SJohn Dyson 
			/* Post the per-request completion signal if one was asked for. */
8152244ea07SJohn Dyson 			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
8162244ea07SJohn Dyson 				psignal(userp, cb->aio_sigevent.sigev_signo);
8172244ea07SJohn Dyson 			}
8182244ea07SJohn Dyson 		}
8192244ea07SJohn Dyson 
820fd3bf775SJohn Dyson 		/*
821bfbbc4aaSJason Evans 		 * Disconnect from user address space.
822fd3bf775SJohn Dyson 		 */
823fd3bf775SJohn Dyson 		if (curcp != mycp) {
824bfbbc4aaSJason Evans 			/* Get the user address space to disconnect from. */
825fd3bf775SJohn Dyson 			tmpvm = mycp->p_vmspace;
826bfbbc4aaSJason Evans 
827bfbbc4aaSJason Evans 			/* Get original address space for daemon. */
828fd3bf775SJohn Dyson 			mycp->p_vmspace = myvm;
829bfbbc4aaSJason Evans 
830bfbbc4aaSJason Evans 			/* Activate the daemon's address space. */
831fd3bf775SJohn Dyson 			pmap_activate(mycp);
832bfbbc4aaSJason Evans #ifdef DIAGNOSTIC
833bfbbc4aaSJason Evans 			if (tmpvm == myvm) {
834bfbbc4aaSJason Evans 				printf("AIOD: vmspace problem -- %d\n",
835bfbbc4aaSJason Evans 				    mycp->p_pid);
836bfbbc4aaSJason Evans 			}
83711783b14SJohn Dyson #endif
838bfbbc4aaSJason Evans 			/* Remove our vmspace reference. */
8392244ea07SJohn Dyson 			vmspace_free(tmpvm);
840bfbbc4aaSJason Evans 
841fd3bf775SJohn Dyson 			/*
842bfbbc4aaSJason Evans 			 * Disassociate from the user process's file
843bfbbc4aaSJason Evans 			 * descriptors.
844fd3bf775SJohn Dyson 			 */
845fd3bf775SJohn Dyson 			if (mycp->p_fd)
846fd3bf775SJohn Dyson 				fdfree(mycp);
847fd3bf775SJohn Dyson 			mycp->p_fd = NULL;
848fd3bf775SJohn Dyson 			curcp = mycp;
849fd3bf775SJohn Dyson 		}
850fd3bf775SJohn Dyson 
851fd3bf775SJohn Dyson 		/*
852fd3bf775SJohn Dyson 		 * If we are the first to be put onto the free queue, wakeup
853fd3bf775SJohn Dyson 		 * anyone waiting for a daemon.
854fd3bf775SJohn Dyson 		 */
855bfbbc4aaSJason Evans 		s = splnet();
856fd3bf775SJohn Dyson 		TAILQ_REMOVE(&aio_activeproc, aiop, list);
857fd3bf775SJohn Dyson 		if (TAILQ_EMPTY(&aio_freeproc))
858fd3bf775SJohn Dyson 			wakeup(&aio_freeproc);
859fd3bf775SJohn Dyson 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
860fd3bf775SJohn Dyson 		aiop->aioprocflags |= AIOP_FREE;
861bfbbc4aaSJason Evans 		splx(s);
862fd3bf775SJohn Dyson 
863fd3bf775SJohn Dyson 		/*
864bfbbc4aaSJason Evans 		 * If daemon is inactive for a long time, allow it to exit,
865bfbbc4aaSJason Evans 		 * thereby freeing resources.
		 *
		 * The sleep is skipped when AIOP_SCHED is set.  The flag is
		 * only cleared in this routine; presumably it is set by the
		 * code that hands this daemon work -- TODO confirm.  The
		 * exit check runs only when tsleep() returns non-zero.
866fd3bf775SJohn Dyson 		 */
867bfbbc4aaSJason Evans 		if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
868bfbbc4aaSJason Evans 		    PRIBIO, "aiordy", aiod_lifetime)) {
869bfbbc4aaSJason Evans 			s = splnet();
870fd3bf775SJohn Dyson 			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
871fd3bf775SJohn Dyson 			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
87284af4da6SJohn Dyson 				if ((aiop->aioprocflags & AIOP_FREE) &&
87384af4da6SJohn Dyson 				    (num_aio_procs > target_aio_procs)) {
874fd3bf775SJohn Dyson 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
875bfbbc4aaSJason Evans 					splx(s);
876fd3bf775SJohn Dyson 					zfree(aiop_zone, aiop);
87784af4da6SJohn Dyson 					num_aio_procs--;
878bfbbc4aaSJason Evans #ifdef DIAGNOSTIC
879bfbbc4aaSJason Evans 					if (mycp->p_vmspace->vm_refcnt <= 1) {
880bfbbc4aaSJason Evans 						printf("AIOD: bad vm refcnt for"
881bfbbc4aaSJason Evans 						    " exiting daemon: %d\n",
882fd3bf775SJohn Dyson 						    mycp->p_vmspace->vm_refcnt);
883bfbbc4aaSJason Evans 					}
88411783b14SJohn Dyson #endif
885fd3bf775SJohn Dyson 					exit1(mycp, 0);
886fd3bf775SJohn Dyson 				}
887fd3bf775SJohn Dyson 			}
888bfbbc4aaSJason Evans 			splx(s);
8892244ea07SJohn Dyson 		}
8902244ea07SJohn Dyson 	}
8912244ea07SJohn Dyson }
8922244ea07SJohn Dyson 
8932244ea07SJohn Dyson /*
894bfbbc4aaSJason Evans  * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
895bfbbc4aaSJason Evans  * AIO daemon modifies its environment itself.
8962244ea07SJohn Dyson  */
8972244ea07SJohn Dyson static int
898fd3bf775SJohn Dyson aio_newproc()
899fd3bf775SJohn Dyson {
9002244ea07SJohn Dyson 	int error;
901fd3bf775SJohn Dyson 	struct proc *p, *np;
9022244ea07SJohn Dyson 
9035206bca1SLuoqi Chen 	p = &proc0;
904df8abd0bSPeter Wemm 	error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
905d5558c00SPeter Wemm 	if (error)
9062244ea07SJohn Dyson 		return error;
9075206bca1SLuoqi Chen 	cpu_set_fork_handler(np, aio_daemon, curproc);
9082244ea07SJohn Dyson 
909fd3bf775SJohn Dyson 	/*
910bfbbc4aaSJason Evans 	 * Wait until daemon is started, but continue on just in case to
911fd3bf775SJohn Dyson 	 * handle error conditions.
912fd3bf775SJohn Dyson 	 */
91384af4da6SJohn Dyson 	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
91484af4da6SJohn Dyson 	num_aio_procs++;
9152244ea07SJohn Dyson 
9162244ea07SJohn Dyson 	return error;
9172244ea07SJohn Dyson }
9182244ea07SJohn Dyson 
9192244ea07SJohn Dyson /*
92084af4da6SJohn Dyson  * Try the high-performance physio method for eligible VCHR devices.  This
921bfbbc4aaSJason Evans  * routine doesn't require the use of any additional threads, and have overhead.
922fd3bf775SJohn Dyson  */
923fd3bf775SJohn Dyson int
924bfbbc4aaSJason Evans aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
925fd3bf775SJohn Dyson {
926fd3bf775SJohn Dyson 	int error;
927fd3bf775SJohn Dyson 	struct aiocb *cb;
928fd3bf775SJohn Dyson 	struct file *fp;
929fd3bf775SJohn Dyson 	struct buf *bp;
930fd3bf775SJohn Dyson 	struct vnode *vp;
931fd3bf775SJohn Dyson 	struct kaioinfo *ki;
932fd3bf775SJohn Dyson 	struct filedesc *fdp;
93384af4da6SJohn Dyson 	struct aio_liojob *lj;
934fd3bf775SJohn Dyson 	int fd;
935fd3bf775SJohn Dyson 	int s;
936fd3bf775SJohn Dyson 	int cnt;
937fd3bf775SJohn Dyson 
93884af4da6SJohn Dyson 	cb = &aiocbe->uaiocb;
939fd3bf775SJohn Dyson 	fdp = p->p_fd;
940fd3bf775SJohn Dyson 	fd = cb->aio_fildes;
941fd3bf775SJohn Dyson 	fp = fdp->fd_ofiles[fd];
942fd3bf775SJohn Dyson 
943008626c3SPoul-Henning Kamp 	if (fp->f_type != DTYPE_VNODE)
944008626c3SPoul-Henning Kamp 		return (-1);
945fd3bf775SJohn Dyson 
946fd3bf775SJohn Dyson 	vp = (struct vnode *)fp->f_data;
94711783b14SJohn Dyson 
948f582ac06SBrian Feldman 	/*
949f582ac06SBrian Feldman 	 * If its not a disk, we don't want to return a positive error.
950f582ac06SBrian Feldman 	 * It causes the aio code to not fall through to try the thread
951f582ac06SBrian Feldman 	 * way when you're talking to a regular file.
952f582ac06SBrian Feldman 	 */
953f582ac06SBrian Feldman 	if (!vn_isdisk(vp, &error)) {
954f582ac06SBrian Feldman 		if (error == ENOTBLK)
955f582ac06SBrian Feldman 			return (-1);
956f582ac06SBrian Feldman 		else
957ba4ad1fcSPoul-Henning Kamp 			return (error);
958f582ac06SBrian Feldman 	}
959fd3bf775SJohn Dyson 
960008626c3SPoul-Henning Kamp  	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
961008626c3SPoul-Henning Kamp 		return (-1);
962fd3bf775SJohn Dyson 
963008626c3SPoul-Henning Kamp 	if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio))
964008626c3SPoul-Henning Kamp 		return (-1);
965fd3bf775SJohn Dyson 
966fd3bf775SJohn Dyson 	ki = p->p_aioinfo;
967008626c3SPoul-Henning Kamp 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
968008626c3SPoul-Henning Kamp 		return (-1);
969fd3bf775SJohn Dyson 
970fd3bf775SJohn Dyson 	cnt = cb->aio_nbytes;
971008626c3SPoul-Henning Kamp 	if (cnt > MAXPHYS)
972008626c3SPoul-Henning Kamp 		return (-1);
97384af4da6SJohn Dyson 
97484af4da6SJohn Dyson 	/*
975bfbbc4aaSJason Evans 	 * Physical I/O is charged directly to the process, so we don't have to
976bfbbc4aaSJason Evans 	 * fake it.
97784af4da6SJohn Dyson 	 */
97884af4da6SJohn Dyson 	aiocbe->inputcharge = 0;
97984af4da6SJohn Dyson 	aiocbe->outputcharge = 0;
980fd3bf775SJohn Dyson 
981fd3bf775SJohn Dyson 	ki->kaio_buffer_count++;
98211783b14SJohn Dyson 
98311783b14SJohn Dyson 	lj = aiocbe->lio;
984bfbbc4aaSJason Evans 	if (lj)
98584af4da6SJohn Dyson 		lj->lioj_buffer_count++;
986fd3bf775SJohn Dyson 
987bfbbc4aaSJason Evans 	/* Create and build a buffer header for a transfer. */
9881c7c3c6aSMatthew Dillon 	bp = (struct buf *)getpbuf(NULL);
989fd3bf775SJohn Dyson 
990fd3bf775SJohn Dyson 	/*
991bfbbc4aaSJason Evans 	 * Get a copy of the kva from the physical buffer.
992fd3bf775SJohn Dyson 	 */
993b0eeea20SPoul-Henning Kamp 	bp->b_caller1 = p;
99449ff4debSPoul-Henning Kamp 	bp->b_dev = vp->v_rdev;
995fd3bf775SJohn Dyson 	error = bp->b_error = 0;
996fd3bf775SJohn Dyson 
997fd3bf775SJohn Dyson 	bp->b_bcount = cb->aio_nbytes;
998fd3bf775SJohn Dyson 	bp->b_bufsize = cb->aio_nbytes;
99902c58685SPoul-Henning Kamp 	bp->b_flags = B_PHYS | B_CALL;
1000fd3bf775SJohn Dyson 	bp->b_iodone = aio_physwakeup;
1001fd3bf775SJohn Dyson 	bp->b_saveaddr = bp->b_data;
100264889941SJohn Dyson 	bp->b_data = (void *)cb->aio_buf;
1003fd3bf775SJohn Dyson 	bp->b_blkno = btodb(cb->aio_offset);
1004fd3bf775SJohn Dyson 
100502c58685SPoul-Henning Kamp 	if (cb->aio_lio_opcode == LIO_WRITE) {
100602c58685SPoul-Henning Kamp 		bp->b_flags |= B_WRITE;
100702c58685SPoul-Henning Kamp 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1008fd3bf775SJohn Dyson 			error = EFAULT;
1009fd3bf775SJohn Dyson 			goto doerror;
1010fd3bf775SJohn Dyson 		}
101102c58685SPoul-Henning Kamp 	} else {
101202c58685SPoul-Henning Kamp 		bp->b_flags |= B_READ;
101302c58685SPoul-Henning Kamp 		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1014fd3bf775SJohn Dyson 			error = EFAULT;
1015fd3bf775SJohn Dyson 			goto doerror;
1016fd3bf775SJohn Dyson 		}
101702c58685SPoul-Henning Kamp 	}
1018fd3bf775SJohn Dyson 
1019bfbbc4aaSJason Evans 	/* Bring buffer into kernel space. */
1020fd3bf775SJohn Dyson 	vmapbuf(bp);
1021fd3bf775SJohn Dyson 
102284af4da6SJohn Dyson 	s = splbio();
1023fd3bf775SJohn Dyson 	aiocbe->bp = bp;
1024fd3bf775SJohn Dyson 	bp->b_spc = (void *)aiocbe;
1025fd3bf775SJohn Dyson 	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
102684af4da6SJohn Dyson 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1027fd3bf775SJohn Dyson 	aiocbe->jobstate = JOBST_JOBQBUF;
102884af4da6SJohn Dyson 	cb->_aiocb_private.status = cb->aio_nbytes;
102984af4da6SJohn Dyson 	num_buf_aio++;
1030fd3bf775SJohn Dyson 	bp->b_error = 0;
1031fd3bf775SJohn Dyson 
103284af4da6SJohn Dyson 	splx(s);
1033bfbbc4aaSJason Evans 
1034bfbbc4aaSJason Evans 	/* Perform transfer. */
103549ff4debSPoul-Henning Kamp 	BUF_STRATEGY(bp, 0);
1036fd3bf775SJohn Dyson 
103784af4da6SJohn Dyson 	s = splbio();
1038bfbbc4aaSJason Evans 
103911783b14SJohn Dyson 	/*
104011783b14SJohn Dyson 	 * If we had an error invoking the request, or an error in processing
1041bfbbc4aaSJason Evans 	 * the request before we have returned, we process it as an error in
1042bfbbc4aaSJason Evans 	 * transfer.  Note that such an I/O error is not indicated immediately,
1043bfbbc4aaSJason Evans 	 * but is returned using the aio_error mechanism.  In this case,
1044bfbbc4aaSJason Evans 	 * aio_suspend will return immediately.
104511783b14SJohn Dyson 	 */
104611783b14SJohn Dyson 	if (bp->b_error || (bp->b_flags & B_ERROR)) {
104711783b14SJohn Dyson 		struct aiocb *job = aiocbe->uuaiocb;
104811783b14SJohn Dyson 
104911783b14SJohn Dyson 		aiocbe->uaiocb._aiocb_private.status = 0;
105011783b14SJohn Dyson 		suword(&job->_aiocb_private.status, 0);
105111783b14SJohn Dyson 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
105211783b14SJohn Dyson 		suword(&job->_aiocb_private.error, bp->b_error);
105311783b14SJohn Dyson 
105411783b14SJohn Dyson 		ki->kaio_buffer_finished_count++;
105511783b14SJohn Dyson 
105611783b14SJohn Dyson 		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
105711783b14SJohn Dyson 			aiocbe->jobstate = JOBST_JOBBFINISHED;
105811783b14SJohn Dyson 			aiocbe->jobflags |= AIOCBLIST_DONE;
1059fd3bf775SJohn Dyson 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
106084af4da6SJohn Dyson 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
106111783b14SJohn Dyson 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
106211783b14SJohn Dyson 		}
106311783b14SJohn Dyson 	}
106484af4da6SJohn Dyson 	splx(s);
1065fd3bf775SJohn Dyson 	return 0;
1066fd3bf775SJohn Dyson 
1067fd3bf775SJohn Dyson doerror:
1068fd3bf775SJohn Dyson 	ki->kaio_buffer_count--;
1069bfbbc4aaSJason Evans 	if (lj)
107084af4da6SJohn Dyson 		lj->lioj_buffer_count--;
107184af4da6SJohn Dyson 	aiocbe->bp = NULL;
10721c7c3c6aSMatthew Dillon 	relpbuf(bp, NULL);
1073fd3bf775SJohn Dyson 	return error;
1074fd3bf775SJohn Dyson }
1075fd3bf775SJohn Dyson 
107684af4da6SJohn Dyson /*
107784af4da6SJohn Dyson  * This waits/tests physio completion.
107884af4da6SJohn Dyson  */
1079fd3bf775SJohn Dyson int
1080bfbbc4aaSJason Evans aio_fphysio(struct proc *p, struct aiocblist *iocb, int flgwait)
1081fd3bf775SJohn Dyson {
1082fd3bf775SJohn Dyson 	int s;
1083fd3bf775SJohn Dyson 	struct buf *bp;
1084fd3bf775SJohn Dyson 	int error;
1085fd3bf775SJohn Dyson 
1086fd3bf775SJohn Dyson 	bp = iocb->bp;
1087fd3bf775SJohn Dyson 
1088fd3bf775SJohn Dyson 	s = splbio();
1089fd3bf775SJohn Dyson 	if (flgwait == 0) {
1090fd3bf775SJohn Dyson 		if ((bp->b_flags & B_DONE) == 0) {
1091fd3bf775SJohn Dyson 			splx(s);
1092fd3bf775SJohn Dyson 			return EINPROGRESS;
1093fd3bf775SJohn Dyson 		}
1094fd3bf775SJohn Dyson 	}
1095fd3bf775SJohn Dyson 
1096fd3bf775SJohn Dyson 	while ((bp->b_flags & B_DONE) == 0) {
109784af4da6SJohn Dyson 		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
1098fd3bf775SJohn Dyson 			if ((bp->b_flags & B_DONE) == 0) {
1099fd3bf775SJohn Dyson 				splx(s);
1100fd3bf775SJohn Dyson 				return EINPROGRESS;
1101bfbbc4aaSJason Evans 			} else
1102fd3bf775SJohn Dyson 				break;
1103fd3bf775SJohn Dyson 		}
1104fd3bf775SJohn Dyson 	}
1105fd3bf775SJohn Dyson 
1106bfbbc4aaSJason Evans 	/* Release mapping into kernel space. */
1107fd3bf775SJohn Dyson 	vunmapbuf(bp);
1108fd3bf775SJohn Dyson 	iocb->bp = 0;
1109fd3bf775SJohn Dyson 
1110fd3bf775SJohn Dyson 	error = 0;
1111bfbbc4aaSJason Evans 
1112bfbbc4aaSJason Evans 	/* Check for an error. */
1113bfbbc4aaSJason Evans 	if (bp->b_flags & B_ERROR)
1114fd3bf775SJohn Dyson 		error = bp->b_error;
1115fd3bf775SJohn Dyson 
11161c7c3c6aSMatthew Dillon 	relpbuf(bp, NULL);
1117fd3bf775SJohn Dyson 	return (error);
1118fd3bf775SJohn Dyson }
1119fd3bf775SJohn Dyson 
1120fd3bf775SJohn Dyson /*
1121bfbbc4aaSJason Evans  * Wake up aio requests that may be serviceable now.
1122bfbbc4aaSJason Evans  */
1123bfbbc4aaSJason Evans void
1124bfbbc4aaSJason Evans aio_swake(struct socket *so, struct sockbuf *sb)
1125bfbbc4aaSJason Evans {
1126bfbbc4aaSJason Evans 	struct aiocblist *cb,*cbn;
1127bfbbc4aaSJason Evans 	struct proc *p;
1128bfbbc4aaSJason Evans 	struct kaioinfo *ki = NULL;
1129bfbbc4aaSJason Evans 	int opcode, wakecount = 0;
1130bfbbc4aaSJason Evans 	struct aioproclist *aiop;
1131bfbbc4aaSJason Evans 
1132bfbbc4aaSJason Evans 	if (sb == &so->so_snd) {
1133bfbbc4aaSJason Evans 		opcode = LIO_WRITE;
1134bfbbc4aaSJason Evans 		so->so_snd.sb_flags &= ~SB_AIO;
1135bfbbc4aaSJason Evans 	} else {
1136bfbbc4aaSJason Evans 		opcode = LIO_READ;
1137bfbbc4aaSJason Evans 		so->so_rcv.sb_flags &= ~SB_AIO;
1138bfbbc4aaSJason Evans 	}
1139bfbbc4aaSJason Evans 
1140bfbbc4aaSJason Evans 	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1141bfbbc4aaSJason Evans 		cbn = TAILQ_NEXT(cb, list);
1142bfbbc4aaSJason Evans 		if (opcode == cb->uaiocb.aio_lio_opcode) {
1143bfbbc4aaSJason Evans 			p = cb->userproc;
1144bfbbc4aaSJason Evans 			ki = p->p_aioinfo;
1145bfbbc4aaSJason Evans 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1146bfbbc4aaSJason Evans 			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1147bfbbc4aaSJason Evans 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1148bfbbc4aaSJason Evans 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1149bfbbc4aaSJason Evans 			wakecount++;
1150bfbbc4aaSJason Evans 			if (cb->jobstate != JOBST_JOBQGLOBAL)
1151bfbbc4aaSJason Evans 				panic("invalid queue value");
1152bfbbc4aaSJason Evans 		}
1153bfbbc4aaSJason Evans 	}
1154bfbbc4aaSJason Evans 
1155bfbbc4aaSJason Evans 	while (wakecount--) {
1156bfbbc4aaSJason Evans 		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1157bfbbc4aaSJason Evans 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1158bfbbc4aaSJason Evans 			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1159bfbbc4aaSJason Evans 			aiop->aioprocflags &= ~AIOP_FREE;
1160bfbbc4aaSJason Evans 			wakeup(aiop->aioproc);
1161bfbbc4aaSJason Evans 		}
1162bfbbc4aaSJason Evans 	}
1163bfbbc4aaSJason Evans }
1164bfbbc4aaSJason Evans 
1165bfbbc4aaSJason Evans /*
1166bfbbc4aaSJason Evans  * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
1167bfbbc4aaSJason Evans  * technique is done in this code.
11682244ea07SJohn Dyson  */
11692244ea07SJohn Dyson static int
117084af4da6SJohn Dyson _aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
1171fd3bf775SJohn Dyson {
11722244ea07SJohn Dyson 	struct filedesc *fdp;
11732244ea07SJohn Dyson 	struct file *fp;
11742244ea07SJohn Dyson 	unsigned int fd;
1175bfbbc4aaSJason Evans 	struct socket *so;
1176bfbbc4aaSJason Evans 	int s;
11772244ea07SJohn Dyson 	int error;
11782244ea07SJohn Dyson 	int opcode;
11792244ea07SJohn Dyson 	struct aiocblist *aiocbe;
11802244ea07SJohn Dyson 	struct aioproclist *aiop;
11812244ea07SJohn Dyson 	struct kaioinfo *ki;
11822244ea07SJohn Dyson 
1183bfbbc4aaSJason Evans 	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
11842244ea07SJohn Dyson 		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
1185bfbbc4aaSJason Evans 	else
1186fd3bf775SJohn Dyson 		aiocbe = zalloc (aiocb_zone);
11872244ea07SJohn Dyson 
1188fd3bf775SJohn Dyson 	aiocbe->inputcharge = 0;
1189fd3bf775SJohn Dyson 	aiocbe->outputcharge = 0;
1190fd3bf775SJohn Dyson 
1191fd3bf775SJohn Dyson 	suword(&job->_aiocb_private.status, -1);
1192fd3bf775SJohn Dyson 	suword(&job->_aiocb_private.error, 0);
1193fd3bf775SJohn Dyson 	suword(&job->_aiocb_private.kernelinfo, -1);
1194fd3bf775SJohn Dyson 
1195bfbbc4aaSJason Evans 	error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof
1196bfbbc4aaSJason Evans 	    aiocbe->uaiocb);
11972244ea07SJohn Dyson 	if (error) {
1198fd3bf775SJohn Dyson 		suword(&job->_aiocb_private.error, error);
1199fd3bf775SJohn Dyson 
12002244ea07SJohn Dyson 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
12012244ea07SJohn Dyson 		return error;
12022244ea07SJohn Dyson 	}
12032244ea07SJohn Dyson 
1204bfbbc4aaSJason Evans 	/* Save userspace address of the job info. */
120511783b14SJohn Dyson 	aiocbe->uuaiocb = job;
120611783b14SJohn Dyson 
1207bfbbc4aaSJason Evans 	/* Get the opcode. */
1208bfbbc4aaSJason Evans 	if (type != LIO_NOP)
1209a624e84fSJohn Dyson 		aiocbe->uaiocb.aio_lio_opcode = type;
1210a624e84fSJohn Dyson 	opcode = aiocbe->uaiocb.aio_lio_opcode;
12112244ea07SJohn Dyson 
1212bfbbc4aaSJason Evans 	/* Get the fd info for process. */
12132244ea07SJohn Dyson 	fdp = p->p_fd;
12142244ea07SJohn Dyson 
12152244ea07SJohn Dyson 	/*
1216bfbbc4aaSJason Evans 	 * Range check file descriptor.
12172244ea07SJohn Dyson 	 */
12182244ea07SJohn Dyson 	fd = aiocbe->uaiocb.aio_fildes;
12192244ea07SJohn Dyson 	if (fd >= fdp->fd_nfiles) {
12202244ea07SJohn Dyson 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1221bfbbc4aaSJason Evans 		if (type == 0)
12222244ea07SJohn Dyson 			suword(&job->_aiocb_private.error, EBADF);
12232244ea07SJohn Dyson 		return EBADF;
12242244ea07SJohn Dyson 	}
12252244ea07SJohn Dyson 
1226bfbbc4aaSJason Evans 	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1227bfbbc4aaSJason Evans 	if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1228bfbbc4aaSJason Evans 	    0))) {
12292244ea07SJohn Dyson 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1230bfbbc4aaSJason Evans 		if (type == 0)
12312244ea07SJohn Dyson 			suword(&job->_aiocb_private.error, EBADF);
12322244ea07SJohn Dyson 		return EBADF;
12332244ea07SJohn Dyson 	}
12342244ea07SJohn Dyson 
12352244ea07SJohn Dyson 	if (aiocbe->uaiocb.aio_offset == -1LL) {
12362244ea07SJohn Dyson 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1237bfbbc4aaSJason Evans 		if (type == 0)
12382244ea07SJohn Dyson 			suword(&job->_aiocb_private.error, EINVAL);
12392244ea07SJohn Dyson 		return EINVAL;
12402244ea07SJohn Dyson 	}
12412244ea07SJohn Dyson 
12422244ea07SJohn Dyson 	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
12432244ea07SJohn Dyson 	if (error) {
12442244ea07SJohn Dyson 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1245bfbbc4aaSJason Evans 		if (type == 0)
12462244ea07SJohn Dyson 			suword(&job->_aiocb_private.error, EINVAL);
12472244ea07SJohn Dyson 		return error;
12482244ea07SJohn Dyson 	}
12492244ea07SJohn Dyson 
125030166fabSBruce Evans 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
12512d2f8ae7SBruce Evans 	if (jobrefid == LONG_MAX)
1252fd3bf775SJohn Dyson 		jobrefid = 1;
12532d2f8ae7SBruce Evans 	else
12542d2f8ae7SBruce Evans 		jobrefid++;
12552244ea07SJohn Dyson 
12562244ea07SJohn Dyson 	if (opcode == LIO_NOP) {
12572244ea07SJohn Dyson 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
12582244ea07SJohn Dyson 		if (type == 0) {
12592244ea07SJohn Dyson 			suword(&job->_aiocb_private.error, 0);
1260fd3bf775SJohn Dyson 			suword(&job->_aiocb_private.status, 0);
1261fd3bf775SJohn Dyson 			suword(&job->_aiocb_private.kernelinfo, 0);
12622244ea07SJohn Dyson 		}
12632244ea07SJohn Dyson 		return 0;
12642244ea07SJohn Dyson 	}
12652244ea07SJohn Dyson 
1266fd3bf775SJohn Dyson 	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
12672244ea07SJohn Dyson 		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
12682244ea07SJohn Dyson 		if (type == 0) {
1269fd3bf775SJohn Dyson 			suword(&job->_aiocb_private.status, 0);
12702244ea07SJohn Dyson 			suword(&job->_aiocb_private.error, EINVAL);
12712244ea07SJohn Dyson 		}
12722244ea07SJohn Dyson 		return EINVAL;
12732244ea07SJohn Dyson 	}
12742244ea07SJohn Dyson 
1275fd3bf775SJohn Dyson 	suword(&job->_aiocb_private.error, EINPROGRESS);
1276fd3bf775SJohn Dyson 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
12772244ea07SJohn Dyson 	aiocbe->userproc = p;
12782244ea07SJohn Dyson 	aiocbe->jobflags = 0;
127984af4da6SJohn Dyson 	aiocbe->lio = lj;
128084af4da6SJohn Dyson 	ki = p->p_aioinfo;
12812244ea07SJohn Dyson 
1282bfbbc4aaSJason Evans 	if (fp->f_type == DTYPE_SOCKET) {
1283bfbbc4aaSJason Evans 		/*
1284bfbbc4aaSJason Evans 		 * Alternate queueing for socket ops: Reach down into the
1285bfbbc4aaSJason Evans 		 * descriptor to get the socket data.  Then check to see if the
1286bfbbc4aaSJason Evans 		 * socket is ready to be read or written (based on the requested
1287bfbbc4aaSJason Evans 		 * operation).
1288bfbbc4aaSJason Evans 		 *
1289bfbbc4aaSJason Evans 		 * If it is not ready for io, then queue the aiocbe on the
1290bfbbc4aaSJason Evans 		 * socket, and set the flags so we get a call when sbnotify()
1291bfbbc4aaSJason Evans 		 * happens.
1292bfbbc4aaSJason Evans 		 */
1293bfbbc4aaSJason Evans 		so = (struct socket *)fp->f_data;
1294bfbbc4aaSJason Evans 		s = splnet();
1295bfbbc4aaSJason Evans 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1296bfbbc4aaSJason Evans 		    LIO_WRITE) && (!sowriteable(so)))) {
1297bfbbc4aaSJason Evans 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1298bfbbc4aaSJason Evans 			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1299bfbbc4aaSJason Evans 			if (opcode == LIO_READ)
1300bfbbc4aaSJason Evans 				so->so_rcv.sb_flags |= SB_AIO;
1301bfbbc4aaSJason Evans 			else
1302bfbbc4aaSJason Evans 				so->so_snd.sb_flags |= SB_AIO;
1303bfbbc4aaSJason Evans 			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1304bfbbc4aaSJason Evans 			ki->kaio_queue_count++;
1305bfbbc4aaSJason Evans 			num_queue_count++;
1306bfbbc4aaSJason Evans 			splx(s);
1307fd3bf775SJohn Dyson 			return 0;
1308bfbbc4aaSJason Evans 		}
1309bfbbc4aaSJason Evans 		splx(s);
1310bfbbc4aaSJason Evans 	}
1311bfbbc4aaSJason Evans 
1312bfbbc4aaSJason Evans 	if ((error = aio_qphysio(p, aiocbe)) == 0)
1313bfbbc4aaSJason Evans 		return 0;
1314bfbbc4aaSJason Evans 	else if (error > 0) {
1315fd3bf775SJohn Dyson 		suword(&job->_aiocb_private.status, 0);
1316fd3bf775SJohn Dyson 		aiocbe->uaiocb._aiocb_private.error = error;
1317fd3bf775SJohn Dyson 		suword(&job->_aiocb_private.error, error);
1318fd3bf775SJohn Dyson 		return error;
1319fd3bf775SJohn Dyson 	}
1320fd3bf775SJohn Dyson 
1321bfbbc4aaSJason Evans 	/* No buffer for daemon I/O. */
132284af4da6SJohn Dyson 	aiocbe->bp = NULL;
132384af4da6SJohn Dyson 
132484af4da6SJohn Dyson 	ki->kaio_queue_count++;
1325bfbbc4aaSJason Evans 	if (lj)
132684af4da6SJohn Dyson 		lj->lioj_queue_count++;
1327bfbbc4aaSJason Evans 	s = splnet();
1328fd3bf775SJohn Dyson 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1329fd3bf775SJohn Dyson 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1330bfbbc4aaSJason Evans 	splx(s);
1331fd3bf775SJohn Dyson 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1332fd3bf775SJohn Dyson 
133384af4da6SJohn Dyson 	num_queue_count++;
1334fd3bf775SJohn Dyson 	error = 0;
1335fd3bf775SJohn Dyson 
1336fd3bf775SJohn Dyson 	/*
1337bfbbc4aaSJason Evans 	 * If we don't have a free AIO process, and we are below our quota, then
1338bfbbc4aaSJason Evans 	 * start one.  Otherwise, depend on the subsequent I/O completions to
1339bfbbc4aaSJason Evans 	 * pick-up this job.  If we don't successfully create the new process
1340bfbbc4aaSJason Evans 	 * (thread) due to resource issues, we return an error for now (EAGAIN),
1341bfbbc4aaSJason Evans 	 * which is likely not the correct thing to do.
1342fd3bf775SJohn Dyson 	 */
13432244ea07SJohn Dyson retryproc:
1344bfbbc4aaSJason Evans 	s = splnet();
1345d254af07SMatthew Dillon 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
13462244ea07SJohn Dyson 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
13472244ea07SJohn Dyson 		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
13482244ea07SJohn Dyson 		aiop->aioprocflags &= ~AIOP_FREE;
13492244ea07SJohn Dyson 		wakeup(aiop->aioproc);
1350fd3bf775SJohn Dyson 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1351fd3bf775SJohn Dyson 	    ((ki->kaio_active_count + num_aio_resv_start) <
1352fd3bf775SJohn Dyson 	    ki->kaio_maxactive_count)) {
1353fd3bf775SJohn Dyson 		num_aio_resv_start++;
1354fd3bf775SJohn Dyson 		if ((error = aio_newproc()) == 0) {
135584af4da6SJohn Dyson 			num_aio_resv_start--;
1356e499ed6fSJohn Dyson 			p->p_retval[0] = 0;
13572244ea07SJohn Dyson 			goto retryproc;
1358fd3bf775SJohn Dyson 		}
135984af4da6SJohn Dyson 		num_aio_resv_start--;
1360fd3bf775SJohn Dyson 	}
1361bfbbc4aaSJason Evans 	splx(s);
1362fd3bf775SJohn Dyson 	return error;
13632244ea07SJohn Dyson }
13642244ea07SJohn Dyson 
1365fd3bf775SJohn Dyson /*
1366fd3bf775SJohn Dyson  * This routine queues an AIO request, checking for quotas.
1367fd3bf775SJohn Dyson  */
13682244ea07SJohn Dyson static int
1369fd3bf775SJohn Dyson aio_aqueue(struct proc *p, struct aiocb *job, int type)
1370fd3bf775SJohn Dyson {
13712244ea07SJohn Dyson 	struct kaioinfo *ki;
13722244ea07SJohn Dyson 
1373bfbbc4aaSJason Evans 	if (p->p_aioinfo == NULL)
13742244ea07SJohn Dyson 		aio_init_aioinfo(p);
13752244ea07SJohn Dyson 
13762244ea07SJohn Dyson 	if (num_queue_count >= max_queue_count)
13772244ea07SJohn Dyson 		return EAGAIN;
13782244ea07SJohn Dyson 
13792244ea07SJohn Dyson 	ki = p->p_aioinfo;
13802244ea07SJohn Dyson 	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
13812244ea07SJohn Dyson 		return EAGAIN;
13822244ea07SJohn Dyson 
138384af4da6SJohn Dyson 	return _aio_aqueue(p, job, NULL, type);
13842244ea07SJohn Dyson }
13852244ea07SJohn Dyson 
13862244ea07SJohn Dyson /*
1387bfbbc4aaSJason Evans  * Support the aio_return system call, as a side-effect, kernel resources are
1388bfbbc4aaSJason Evans  * released.
13892244ea07SJohn Dyson  */
13902244ea07SJohn Dyson int
1391fd3bf775SJohn Dyson aio_return(struct proc *p, struct aio_return_args *uap)
1392fd3bf775SJohn Dyson {
1393dd85920aSJason Evans #ifndef VFS_AIO
1394dd85920aSJason Evans 	return ENOSYS;
1395dd85920aSJason Evans #else
139684af4da6SJohn Dyson 	int s;
1397f5ef029eSPoul-Henning Kamp 	int jobref;
139884af4da6SJohn Dyson 	struct aiocblist *cb, *ncb;
139911783b14SJohn Dyson 	struct aiocb *ujob;
14002244ea07SJohn Dyson 	struct kaioinfo *ki;
14012244ea07SJohn Dyson 
14022244ea07SJohn Dyson 	ki = p->p_aioinfo;
1403bfbbc4aaSJason Evans 	if (ki == NULL)
14042244ea07SJohn Dyson 		return EINVAL;
14052244ea07SJohn Dyson 
140611783b14SJohn Dyson 	ujob = uap->aiocbp;
140711783b14SJohn Dyson 
140811783b14SJohn Dyson 	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1409fd3bf775SJohn Dyson 	if (jobref == -1 || jobref == 0)
14102244ea07SJohn Dyson 		return EINVAL;
14112244ea07SJohn Dyson 
1412bfbbc4aaSJason Evans 	s = splnet();
1413bfbbc4aaSJason Evans 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1414bfbbc4aaSJason Evans 	    plist)) {
1415bfbbc4aaSJason Evans 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1416bfbbc4aaSJason Evans 		    jobref) {
1417bfbbc4aaSJason Evans 			splx(s);
141811783b14SJohn Dyson 			if (ujob == cb->uuaiocb) {
1419bfbbc4aaSJason Evans 				p->p_retval[0] =
1420bfbbc4aaSJason Evans 				    cb->uaiocb._aiocb_private.status;
1421bfbbc4aaSJason Evans 			} else
142211783b14SJohn Dyson 				p->p_retval[0] = EFAULT;
1423fd3bf775SJohn Dyson 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1424bfbbc4aaSJason Evans 				curproc->p_stats->p_ru.ru_oublock +=
1425bfbbc4aaSJason Evans 				    cb->outputcharge;
1426fd3bf775SJohn Dyson 				cb->outputcharge = 0;
1427fd3bf775SJohn Dyson 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1428bfbbc4aaSJason Evans 				curproc->p_stats->p_ru.ru_inblock +=
1429bfbbc4aaSJason Evans 				    cb->inputcharge;
1430fd3bf775SJohn Dyson 				cb->inputcharge = 0;
1431fd3bf775SJohn Dyson 			}
14322244ea07SJohn Dyson 			aio_free_entry(cb);
14332244ea07SJohn Dyson 			return 0;
14342244ea07SJohn Dyson 		}
14352244ea07SJohn Dyson 	}
1436bfbbc4aaSJason Evans 	splx(s);
14372244ea07SJohn Dyson 
143884af4da6SJohn Dyson 	s = splbio();
1439bfbbc4aaSJason Evans 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
144084af4da6SJohn Dyson 		ncb = TAILQ_NEXT(cb, plist);
1441bfbbc4aaSJason Evans 		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1442bfbbc4aaSJason Evans 		    == jobref) {
144384af4da6SJohn Dyson 			splx(s);
144411783b14SJohn Dyson 			if (ujob == cb->uuaiocb) {
1445bfbbc4aaSJason Evans 				p->p_retval[0] =
1446bfbbc4aaSJason Evans 				    cb->uaiocb._aiocb_private.status;
1447bfbbc4aaSJason Evans 			} else
144811783b14SJohn Dyson 				p->p_retval[0] = EFAULT;
144984af4da6SJohn Dyson 			aio_free_entry(cb);
145084af4da6SJohn Dyson 			return 0;
145184af4da6SJohn Dyson 		}
145284af4da6SJohn Dyson 	}
145384af4da6SJohn Dyson 	splx(s);
145484af4da6SJohn Dyson 
14552244ea07SJohn Dyson 	return (EINVAL);
1456dd85920aSJason Evans #endif /* VFS_AIO */
14572244ea07SJohn Dyson }
14582244ea07SJohn Dyson 
14592244ea07SJohn Dyson /*
1460bfbbc4aaSJason Evans  * Allow a process to wakeup when any of the I/O requests are completed.
14612244ea07SJohn Dyson  */
14622244ea07SJohn Dyson int
1463fd3bf775SJohn Dyson aio_suspend(struct proc *p, struct aio_suspend_args *uap)
1464fd3bf775SJohn Dyson {
1465dd85920aSJason Evans #ifndef VFS_AIO
1466dd85920aSJason Evans 	return ENOSYS;
1467dd85920aSJason Evans #else
14684a11ca4eSPoul-Henning Kamp 	struct timeval atv;
14692244ea07SJohn Dyson 	struct timespec ts;
14702244ea07SJohn Dyson 	struct aiocb *const *cbptr, *cbp;
14712244ea07SJohn Dyson 	struct kaioinfo *ki;
14722244ea07SJohn Dyson 	struct aiocblist *cb;
14732244ea07SJohn Dyson 	int i;
147484af4da6SJohn Dyson 	int njoblist;
14752244ea07SJohn Dyson 	int error, s, timo;
147611783b14SJohn Dyson 	int *ijoblist;
147711783b14SJohn Dyson 	struct aiocb **ujoblist;
14782244ea07SJohn Dyson 
1479fd3bf775SJohn Dyson 	if (uap->nent >= AIO_LISTIO_MAX)
1480fd3bf775SJohn Dyson 		return EINVAL;
14812244ea07SJohn Dyson 
14822244ea07SJohn Dyson 	timo = 0;
14832244ea07SJohn Dyson 	if (uap->timeout) {
1484bfbbc4aaSJason Evans 		/* Get timespec struct. */
1485bfbbc4aaSJason Evans 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
14862244ea07SJohn Dyson 			return error;
14872244ea07SJohn Dyson 
14882244ea07SJohn Dyson 		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
14892244ea07SJohn Dyson 			return (EINVAL);
14902244ea07SJohn Dyson 
1491e3b3ba2dSDag-Erling Smørgrav 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
14922244ea07SJohn Dyson 		if (itimerfix(&atv))
14932244ea07SJohn Dyson 			return (EINVAL);
1494227ee8a1SPoul-Henning Kamp 		timo = tvtohz(&atv);
14952244ea07SJohn Dyson 	}
14962244ea07SJohn Dyson 
14972244ea07SJohn Dyson 	ki = p->p_aioinfo;
14982244ea07SJohn Dyson 	if (ki == NULL)
14992244ea07SJohn Dyson 		return EAGAIN;
15002244ea07SJohn Dyson 
150184af4da6SJohn Dyson 	njoblist = 0;
150211783b14SJohn Dyson 	ijoblist = zalloc(aiol_zone);
150311783b14SJohn Dyson 	ujoblist = zalloc(aiol_zone);
15042244ea07SJohn Dyson 	cbptr = uap->aiocbp;
15052244ea07SJohn Dyson 
15062244ea07SJohn Dyson 	for (i = 0; i < uap->nent; i++) {
150730166fabSBruce Evans 		cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
150884af4da6SJohn Dyson 		if (cbp == 0)
150984af4da6SJohn Dyson 			continue;
151011783b14SJohn Dyson 		ujoblist[njoblist] = cbp;
151111783b14SJohn Dyson 		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
151284af4da6SJohn Dyson 		njoblist++;
15132244ea07SJohn Dyson 	}
1514bfbbc4aaSJason Evans 
151511783b14SJohn Dyson 	if (njoblist == 0) {
151611783b14SJohn Dyson 		zfree(aiol_zone, ijoblist);
151711783b14SJohn Dyson 		zfree(aiol_zone, ujoblist);
151884af4da6SJohn Dyson 		return 0;
151911783b14SJohn Dyson 	}
15202244ea07SJohn Dyson 
152111783b14SJohn Dyson 	error = 0;
1522bfbbc4aaSJason Evans 	for (;;) {
1523bfbbc4aaSJason Evans 		for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb =
1524bfbbc4aaSJason Evans 		    TAILQ_NEXT(cb, plist)) {
152584af4da6SJohn Dyson 			for (i = 0; i < njoblist; i++) {
1526bfbbc4aaSJason Evans 				if (((intptr_t)
1527bfbbc4aaSJason Evans 				    cb->uaiocb._aiocb_private.kernelinfo) ==
152811783b14SJohn Dyson 				    ijoblist[i]) {
152911783b14SJohn Dyson 					if (ujoblist[i] != cb->uuaiocb)
153011783b14SJohn Dyson 						error = EINVAL;
153111783b14SJohn Dyson 					zfree(aiol_zone, ijoblist);
153211783b14SJohn Dyson 					zfree(aiol_zone, ujoblist);
153311783b14SJohn Dyson 					return error;
153484af4da6SJohn Dyson 				}
153584af4da6SJohn Dyson 			}
153684af4da6SJohn Dyson 		}
153784af4da6SJohn Dyson 
153884af4da6SJohn Dyson 		s = splbio();
1539bfbbc4aaSJason Evans 		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1540bfbbc4aaSJason Evans 		    TAILQ_NEXT(cb, plist)) {
154184af4da6SJohn Dyson 			for (i = 0; i < njoblist; i++) {
1542bfbbc4aaSJason Evans 				if (((intptr_t)
1543bfbbc4aaSJason Evans 				    cb->uaiocb._aiocb_private.kernelinfo) ==
154411783b14SJohn Dyson 				    ijoblist[i]) {
154584af4da6SJohn Dyson 					splx(s);
154611783b14SJohn Dyson 					if (ujoblist[i] != cb->uuaiocb)
154711783b14SJohn Dyson 						error = EINVAL;
154811783b14SJohn Dyson 					zfree(aiol_zone, ijoblist);
154911783b14SJohn Dyson 					zfree(aiol_zone, ujoblist);
155011783b14SJohn Dyson 					return error;
15512244ea07SJohn Dyson 				}
15522244ea07SJohn Dyson 			}
15532244ea07SJohn Dyson 		}
15542244ea07SJohn Dyson 
1555fd3bf775SJohn Dyson 		ki->kaio_flags |= KAIO_WAKEUP;
15562244ea07SJohn Dyson 		error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1557b7592c7bSJason Evans 		splx(s);
15582244ea07SJohn Dyson 
155960ffb019SJason Evans 		if (error == ERESTART || error == EINTR) {
156011783b14SJohn Dyson 			zfree(aiol_zone, ijoblist);
156111783b14SJohn Dyson 			zfree(aiol_zone, ujoblist);
15622244ea07SJohn Dyson 			return EINTR;
15632244ea07SJohn Dyson 		} else if (error == EWOULDBLOCK) {
156411783b14SJohn Dyson 			zfree(aiol_zone, ijoblist);
156511783b14SJohn Dyson 			zfree(aiol_zone, ujoblist);
15662244ea07SJohn Dyson 			return EAGAIN;
15672244ea07SJohn Dyson 		}
15682244ea07SJohn Dyson 	}
15692244ea07SJohn Dyson 
15702244ea07SJohn Dyson /* NOTREACHED */
15712244ea07SJohn Dyson 	return EINVAL;
1572dd85920aSJason Evans #endif /* VFS_AIO */
15732244ea07SJohn Dyson }
1574ee877a35SJohn Dyson 
1575ee877a35SJohn Dyson /*
1576dd85920aSJason Evans  * aio_cancel cancels any non-physio aio operations not currently in
1577dd85920aSJason Evans  * progress.
1578ee877a35SJohn Dyson  */
1579ee877a35SJohn Dyson int
1580fd3bf775SJohn Dyson aio_cancel(struct proc *p, struct aio_cancel_args *uap)
1581fd3bf775SJohn Dyson {
1582dd85920aSJason Evans #ifndef VFS_AIO
158378922e41SJohn Dyson 	return ENOSYS;
1584dd85920aSJason Evans #else
1585dd85920aSJason Evans 	struct kaioinfo *ki;
1586dd85920aSJason Evans 	struct aiocblist *cbe, *cbn;
1587dd85920aSJason Evans 	struct file *fp;
1588dd85920aSJason Evans 	struct filedesc *fdp;
1589dd85920aSJason Evans 	struct socket *so;
1590dd85920aSJason Evans 	struct proc *po;
1591dd85920aSJason Evans 	int s,error;
1592dd85920aSJason Evans 	int cancelled=0;
1593dd85920aSJason Evans 	int notcancelled=0;
1594dd85920aSJason Evans 	struct vnode *vp;
1595dd85920aSJason Evans 
1596dd85920aSJason Evans 	fdp = p->p_fd;
1597dd85920aSJason Evans 
1598dd85920aSJason Evans 	fp = fdp->fd_ofiles[uap->fd];
1599dd85920aSJason Evans 
1600dd85920aSJason Evans 	if (fp == NULL) {
1601dd85920aSJason Evans 		return EBADF;
1602dd85920aSJason Evans 	}
1603dd85920aSJason Evans 
1604dd85920aSJason Evans         if (fp->f_type == DTYPE_VNODE) {
1605dd85920aSJason Evans 		vp = (struct vnode *)fp->f_data;
1606dd85920aSJason Evans 
1607dd85920aSJason Evans 		if (vn_isdisk(vp,&error)) {
1608dd85920aSJason Evans 			p->p_retval[0] = AIO_NOTCANCELED;
1609dd85920aSJason Evans         	        return 0;
1610dd85920aSJason Evans 		}
1611dd85920aSJason Evans 	} else if (fp->f_type == DTYPE_SOCKET) {
1612dd85920aSJason Evans 		so = (struct socket *)fp->f_data;
1613dd85920aSJason Evans 
1614dd85920aSJason Evans 		s = splnet();
1615dd85920aSJason Evans 
1616dd85920aSJason Evans 		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1617dd85920aSJason Evans 			cbn = TAILQ_NEXT(cbe, list);
1618dd85920aSJason Evans 			if ((uap->aiocbp == NULL) ||
1619dd85920aSJason Evans 				(uap->aiocbp == cbe->uuaiocb) ) {
1620dd85920aSJason Evans 				po = cbe->userproc;
1621dd85920aSJason Evans 				ki = po->p_aioinfo;
1622dd85920aSJason Evans 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1623dd85920aSJason Evans 				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1624dd85920aSJason Evans 				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1625dd85920aSJason Evans 				if (ki->kaio_flags & KAIO_WAKEUP) {
1626dd85920aSJason Evans 					wakeup(po);
1627dd85920aSJason Evans 				}
1628dd85920aSJason Evans 				cbe->jobstate = JOBST_JOBFINISHED;
1629dd85920aSJason Evans 				cbe->uaiocb._aiocb_private.status=-1;
1630dd85920aSJason Evans 				cbe->uaiocb._aiocb_private.error=ECANCELED;
1631dd85920aSJason Evans 				cancelled++;
1632dd85920aSJason Evans 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1633dd85920aSJason Evans 				    SIGEV_SIGNAL)
1634dd85920aSJason Evans 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1635dd85920aSJason Evans 				if (uap->aiocbp)
1636dd85920aSJason Evans 					break;
1637dd85920aSJason Evans 			}
1638dd85920aSJason Evans 		}
1639dd85920aSJason Evans 
1640dd85920aSJason Evans 		splx(s);
1641dd85920aSJason Evans 
1642dd85920aSJason Evans 		if ((cancelled) && (uap->aiocbp)) {
1643dd85920aSJason Evans 			p->p_retval[0] = AIO_CANCELED;
1644dd85920aSJason Evans 			return 0;
1645dd85920aSJason Evans 		}
1646dd85920aSJason Evans 
1647dd85920aSJason Evans 	}
1648dd85920aSJason Evans 
1649dd85920aSJason Evans 	ki=p->p_aioinfo;
1650dd85920aSJason Evans 
1651dd85920aSJason Evans 	s = splnet();
1652dd85920aSJason Evans 
1653dd85920aSJason Evans 	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1654dd85920aSJason Evans 		cbn = TAILQ_NEXT(cbe, plist);
1655dd85920aSJason Evans 
1656dd85920aSJason Evans 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1657dd85920aSJason Evans 		    ((uap->aiocbp == NULL ) ||
1658dd85920aSJason Evans 		     (uap->aiocbp == cbe->uuaiocb))) {
1659dd85920aSJason Evans 
1660dd85920aSJason Evans 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1661dd85920aSJason Evans 				TAILQ_REMOVE(&aio_jobs, cbe, list);
1662dd85920aSJason Evans                                 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1663dd85920aSJason Evans                                 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1664dd85920aSJason Evans                                     plist);
1665dd85920aSJason Evans 				cancelled++;
1666dd85920aSJason Evans 				ki->kaio_queue_finished_count++;
1667dd85920aSJason Evans 				cbe->jobstate = JOBST_JOBFINISHED;
1668dd85920aSJason Evans 				cbe->uaiocb._aiocb_private.status = -1;
1669dd85920aSJason Evans 				cbe->uaiocb._aiocb_private.error = ECANCELED;
1670dd85920aSJason Evans 			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1671dd85920aSJason Evans 				    SIGEV_SIGNAL)
1672dd85920aSJason Evans 					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1673dd85920aSJason Evans 			} else {
1674dd85920aSJason Evans 				notcancelled++;
1675dd85920aSJason Evans 			}
1676dd85920aSJason Evans 		}
1677dd85920aSJason Evans 	}
1678dd85920aSJason Evans 
1679dd85920aSJason Evans 	splx(s);
1680dd85920aSJason Evans 
1681dd85920aSJason Evans 
1682dd85920aSJason Evans 	if (notcancelled) {
1683dd85920aSJason Evans 		p->p_retval[0] = AIO_NOTCANCELED;
1684dd85920aSJason Evans 		return 0;
1685dd85920aSJason Evans 	}
1686dd85920aSJason Evans 
1687dd85920aSJason Evans 	if (cancelled) {
1688dd85920aSJason Evans 		p->p_retval[0] = AIO_CANCELED;
1689dd85920aSJason Evans 		return 0;
1690dd85920aSJason Evans 	}
1691dd85920aSJason Evans 
1692dd85920aSJason Evans 	p->p_retval[0] = AIO_ALLDONE;
1693dd85920aSJason Evans 
1694dd85920aSJason Evans 	return 0;
1695dd85920aSJason Evans #endif /* VFS_AIO */
1696ee877a35SJohn Dyson }
1697ee877a35SJohn Dyson 
1698ee877a35SJohn Dyson /*
1699bfbbc4aaSJason Evans  * aio_error is implemented in the kernel level for compatibility purposes only.
1700bfbbc4aaSJason Evans  * For a user mode async implementation, it would be best to do it in a userland
1701bfbbc4aaSJason Evans  * subroutine.
1702ee877a35SJohn Dyson  */
1703ee877a35SJohn Dyson int
1704fd3bf775SJohn Dyson aio_error(struct proc *p, struct aio_error_args *uap)
1705fd3bf775SJohn Dyson {
1706dd85920aSJason Evans #ifndef VFS_AIO
1707dd85920aSJason Evans 	return ENOSYS;
1708dd85920aSJason Evans #else
170984af4da6SJohn Dyson 	int s;
17102244ea07SJohn Dyson 	struct aiocblist *cb;
17112244ea07SJohn Dyson 	struct kaioinfo *ki;
17122244ea07SJohn Dyson 	int jobref;
1713ee877a35SJohn Dyson 
17142244ea07SJohn Dyson 	ki = p->p_aioinfo;
17152244ea07SJohn Dyson 	if (ki == NULL)
17162244ea07SJohn Dyson 		return EINVAL;
17172244ea07SJohn Dyson 
17182244ea07SJohn Dyson 	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1719fd3bf775SJohn Dyson 	if ((jobref == -1) || (jobref == 0))
1720fd3bf775SJohn Dyson 		return EINVAL;
1721ee877a35SJohn Dyson 
1722bfbbc4aaSJason Evans 	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1723bfbbc4aaSJason Evans 	    plist)) {
1724bfbbc4aaSJason Evans 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1725bfbbc4aaSJason Evans 		    jobref) {
1726cb226aaaSPoul-Henning Kamp 			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
17272244ea07SJohn Dyson 			return 0;
17282244ea07SJohn Dyson 		}
1729ee877a35SJohn Dyson 	}
1730ee877a35SJohn Dyson 
1731bfbbc4aaSJason Evans 	s = splnet();
17322244ea07SJohn Dyson 
1733bfbbc4aaSJason Evans 	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1734bfbbc4aaSJason Evans 	    plist)) {
1735bfbbc4aaSJason Evans 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1736bfbbc4aaSJason Evans 		    jobref) {
1737cb226aaaSPoul-Henning Kamp 			p->p_retval[0] = EINPROGRESS;
1738bfbbc4aaSJason Evans 			splx(s);
17392244ea07SJohn Dyson 			return 0;
17402244ea07SJohn Dyson 		}
17412244ea07SJohn Dyson 	}
174260ffb019SJason Evans 
174360ffb019SJason Evans 	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
174460ffb019SJason Evans 	    plist)) {
174560ffb019SJason Evans 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
174660ffb019SJason Evans 		    jobref) {
174760ffb019SJason Evans 			p->p_retval[0] = EINPROGRESS;
174860ffb019SJason Evans 			splx(s);
174960ffb019SJason Evans 			return 0;
175060ffb019SJason Evans 		}
175160ffb019SJason Evans 	}
1752bfbbc4aaSJason Evans 	splx(s);
17532244ea07SJohn Dyson 
175484af4da6SJohn Dyson 	s = splbio();
1755bfbbc4aaSJason Evans 	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1756bfbbc4aaSJason Evans 	    plist)) {
1757bfbbc4aaSJason Evans 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1758bfbbc4aaSJason Evans 		    jobref) {
175984af4da6SJohn Dyson 			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
176084af4da6SJohn Dyson 			splx(s);
176184af4da6SJohn Dyson 			return 0;
176284af4da6SJohn Dyson 		}
176384af4da6SJohn Dyson 	}
176484af4da6SJohn Dyson 
1765bfbbc4aaSJason Evans 	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1766bfbbc4aaSJason Evans 	    plist)) {
1767bfbbc4aaSJason Evans 		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1768bfbbc4aaSJason Evans 		    jobref) {
176984af4da6SJohn Dyson 			p->p_retval[0] = EINPROGRESS;
177084af4da6SJohn Dyson 			splx(s);
177184af4da6SJohn Dyson 			return 0;
177284af4da6SJohn Dyson 		}
177384af4da6SJohn Dyson 	}
177484af4da6SJohn Dyson 	splx(s);
177584af4da6SJohn Dyson 
1776bfbbc4aaSJason Evans #if (0)
17772244ea07SJohn Dyson 	/*
1778bfbbc4aaSJason Evans 	 * Hack for lio.
17792244ea07SJohn Dyson 	 */
17802244ea07SJohn Dyson 	status = fuword(&uap->aiocbp->_aiocb_private.status);
1781bfbbc4aaSJason Evans 	if (status == -1)
17822244ea07SJohn Dyson 		return fuword(&uap->aiocbp->_aiocb_private.error);
1783bfbbc4aaSJason Evans #endif
17842244ea07SJohn Dyson 	return EINVAL;
1785dd85920aSJason Evans #endif /* VFS_AIO */
1786ee877a35SJohn Dyson }
1787ee877a35SJohn Dyson 
1788ee877a35SJohn Dyson int
1789fd3bf775SJohn Dyson aio_read(struct proc *p, struct aio_read_args *uap)
1790fd3bf775SJohn Dyson {
1791dd85920aSJason Evans #ifndef VFS_AIO
1792dd85920aSJason Evans 	return ENOSYS;
1793dd85920aSJason Evans #else
1794ee877a35SJohn Dyson 	struct filedesc *fdp;
1795ee877a35SJohn Dyson 	struct file *fp;
1796ee877a35SJohn Dyson 	struct uio auio;
1797ee877a35SJohn Dyson 	struct iovec aiov;
1798ee877a35SJohn Dyson 	unsigned int fd;
1799ee877a35SJohn Dyson 	int cnt;
1800ee877a35SJohn Dyson 	struct aiocb iocb;
18012244ea07SJohn Dyson 	int error, pmodes;
1802ee877a35SJohn Dyson 
18032244ea07SJohn Dyson 	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1804bfbbc4aaSJason Evans 	if ((pmodes & AIO_PMODE_SYNC) == 0)
18052244ea07SJohn Dyson 		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
1806ee877a35SJohn Dyson 
1807bfbbc4aaSJason Evans 	/* Get control block. */
1808bfbbc4aaSJason Evans 	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
1809bfbbc4aaSJason Evans 	    != 0)
1810ee877a35SJohn Dyson 		return error;
1811ee877a35SJohn Dyson 
1812bfbbc4aaSJason Evans 	/* Get the fd info for process. */
1813ee877a35SJohn Dyson 	fdp = p->p_fd;
1814ee877a35SJohn Dyson 
1815ee877a35SJohn Dyson 	/*
1816bfbbc4aaSJason Evans 	 * Range check file descriptor.
1817ee877a35SJohn Dyson 	 */
1818ee877a35SJohn Dyson 	fd = iocb.aio_fildes;
1819ee877a35SJohn Dyson 	if (fd >= fdp->fd_nfiles)
1820ee877a35SJohn Dyson 		return EBADF;
1821ee877a35SJohn Dyson 	fp = fdp->fd_ofiles[fd];
1822ee877a35SJohn Dyson 	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
1823ee877a35SJohn Dyson 		return EBADF;
18242244ea07SJohn Dyson 	if (iocb.aio_offset == -1LL)
1825ee877a35SJohn Dyson 		return EINVAL;
1826ee877a35SJohn Dyson 
1827ee877a35SJohn Dyson 	auio.uio_resid = iocb.aio_nbytes;
1828ee877a35SJohn Dyson 	if (auio.uio_resid < 0)
1829ee877a35SJohn Dyson 		return (EINVAL);
1830ee877a35SJohn Dyson 
18312244ea07SJohn Dyson 	/*
18322244ea07SJohn Dyson 	 * Process sync simply -- queue async request.
18332244ea07SJohn Dyson 	 */
1834bfbbc4aaSJason Evans 	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0)
18352244ea07SJohn Dyson 		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
18362244ea07SJohn Dyson 
183764889941SJohn Dyson 	aiov.iov_base = (void *)iocb.aio_buf;
18382244ea07SJohn Dyson 	aiov.iov_len = iocb.aio_nbytes;
18392244ea07SJohn Dyson 
18402244ea07SJohn Dyson 	auio.uio_iov = &aiov;
18412244ea07SJohn Dyson 	auio.uio_iovcnt = 1;
18422244ea07SJohn Dyson 	auio.uio_offset = iocb.aio_offset;
1843ee877a35SJohn Dyson 	auio.uio_rw = UIO_READ;
1844ee877a35SJohn Dyson 	auio.uio_segflg = UIO_USERSPACE;
1845ee877a35SJohn Dyson 	auio.uio_procp = p;
1846ee877a35SJohn Dyson 
1847ee877a35SJohn Dyson 	cnt = iocb.aio_nbytes;
184813ccadd4SBrian Feldman 	error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p);
1849bfbbc4aaSJason Evans 	if (error && (auio.uio_resid != cnt) && (error == ERESTART || error ==
1850bfbbc4aaSJason Evans 	    EINTR || error == EWOULDBLOCK))
1851ee877a35SJohn Dyson 		error = 0;
1852ee877a35SJohn Dyson 	cnt -= auio.uio_resid;
1853cb226aaaSPoul-Henning Kamp 	p->p_retval[0] = cnt;
1854ee877a35SJohn Dyson 	return error;
1855dd85920aSJason Evans #endif /* VFS_AIO */
1856ee877a35SJohn Dyson }
1857ee877a35SJohn Dyson 
1858ee877a35SJohn Dyson int
1859fd3bf775SJohn Dyson aio_write(struct proc *p, struct aio_write_args *uap)
1860fd3bf775SJohn Dyson {
1861dd85920aSJason Evans #ifndef VFS_AIO
1862dd85920aSJason Evans 	return ENOSYS;
1863dd85920aSJason Evans #else
1864ee877a35SJohn Dyson 	struct filedesc *fdp;
1865ee877a35SJohn Dyson 	struct file *fp;
1866ee877a35SJohn Dyson 	struct uio auio;
1867ee877a35SJohn Dyson 	struct iovec aiov;
1868ee877a35SJohn Dyson 	unsigned int fd;
1869ee877a35SJohn Dyson 	int cnt;
1870ee877a35SJohn Dyson 	struct aiocb iocb;
1871ee877a35SJohn Dyson 	int error;
18722244ea07SJohn Dyson 	int pmodes;
18732244ea07SJohn Dyson 
18742244ea07SJohn Dyson 	/*
18752244ea07SJohn Dyson 	 * Process sync simply -- queue async request.
18762244ea07SJohn Dyson 	 */
18772244ea07SJohn Dyson 	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1878bfbbc4aaSJason Evans 	if ((pmodes & AIO_PMODE_SYNC) == 0)
18792244ea07SJohn Dyson 		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE);
1880ee877a35SJohn Dyson 
1881bfbbc4aaSJason Evans 	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
1882bfbbc4aaSJason Evans 	    != 0)
1883ee877a35SJohn Dyson 		return error;
1884ee877a35SJohn Dyson 
1885bfbbc4aaSJason Evans 	/* Get the fd info for process. */
1886ee877a35SJohn Dyson 	fdp = p->p_fd;
1887ee877a35SJohn Dyson 
1888ee877a35SJohn Dyson 	/*
1889bfbbc4aaSJason Evans 	 * Range check file descriptor.
1890ee877a35SJohn Dyson 	 */
1891ee877a35SJohn Dyson 	fd = iocb.aio_fildes;
1892ee877a35SJohn Dyson 	if (fd >= fdp->fd_nfiles)
1893ee877a35SJohn Dyson 		return EBADF;
1894ee877a35SJohn Dyson 	fp = fdp->fd_ofiles[fd];
1895ee877a35SJohn Dyson 	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
1896ee877a35SJohn Dyson 		return EBADF;
18972244ea07SJohn Dyson 	if (iocb.aio_offset == -1LL)
1898ee877a35SJohn Dyson 		return EINVAL;
1899ee877a35SJohn Dyson 
190064889941SJohn Dyson 	aiov.iov_base = (void *)iocb.aio_buf;
1901ee877a35SJohn Dyson 	aiov.iov_len = iocb.aio_nbytes;
1902ee877a35SJohn Dyson 	auio.uio_iov = &aiov;
1903ee877a35SJohn Dyson 	auio.uio_iovcnt = 1;
1904ee877a35SJohn Dyson 	auio.uio_offset = iocb.aio_offset;
1905ee877a35SJohn Dyson 
1906ee877a35SJohn Dyson 	auio.uio_resid = iocb.aio_nbytes;
1907ee877a35SJohn Dyson 	if (auio.uio_resid < 0)
1908ee877a35SJohn Dyson 		return (EINVAL);
1909ee877a35SJohn Dyson 
1910ee877a35SJohn Dyson 	auio.uio_rw = UIO_WRITE;
1911ee877a35SJohn Dyson 	auio.uio_segflg = UIO_USERSPACE;
1912ee877a35SJohn Dyson 	auio.uio_procp = p;
1913ee877a35SJohn Dyson 
1914ee877a35SJohn Dyson 	cnt = iocb.aio_nbytes;
191513ccadd4SBrian Feldman 	error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p);
1916ee877a35SJohn Dyson 	if (error) {
1917ee877a35SJohn Dyson 		if (auio.uio_resid != cnt) {
1918bfbbc4aaSJason Evans 			if (error == ERESTART || error == EINTR || error ==
1919bfbbc4aaSJason Evans 			    EWOULDBLOCK)
1920ee877a35SJohn Dyson 				error = 0;
1921ee877a35SJohn Dyson 			if (error == EPIPE)
1922ee877a35SJohn Dyson 				psignal(p, SIGPIPE);
1923ee877a35SJohn Dyson 		}
1924ee877a35SJohn Dyson 	}
1925ee877a35SJohn Dyson 	cnt -= auio.uio_resid;
1926cb226aaaSPoul-Henning Kamp 	p->p_retval[0] = cnt;
1927ee877a35SJohn Dyson 	return error;
1928dd85920aSJason Evans #endif /* VFS_AIO */
1929ee877a35SJohn Dyson }
1930ee877a35SJohn Dyson 
1931ee877a35SJohn Dyson int
1932fd3bf775SJohn Dyson lio_listio(struct proc *p, struct lio_listio_args *uap)
1933fd3bf775SJohn Dyson {
1934dd85920aSJason Evans #ifndef VFS_AIO
1935dd85920aSJason Evans 	return ENOSYS;
1936dd85920aSJason Evans #else
19374a11ca4eSPoul-Henning Kamp 	int nent, nentqueued;
19382244ea07SJohn Dyson 	struct aiocb *iocb, * const *cbptr;
19392244ea07SJohn Dyson 	struct aiocblist *cb;
19402244ea07SJohn Dyson 	struct kaioinfo *ki;
194184af4da6SJohn Dyson 	struct aio_liojob *lj;
19422244ea07SJohn Dyson 	int error, runningcode;
1943fd3bf775SJohn Dyson 	int nerror;
1944ee877a35SJohn Dyson 	int i;
194584af4da6SJohn Dyson 	int s;
1946ee877a35SJohn Dyson 
1947bfbbc4aaSJason Evans 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1948ee877a35SJohn Dyson 		return EINVAL;
19492244ea07SJohn Dyson 
19502244ea07SJohn Dyson 	nent = uap->nent;
1951bfbbc4aaSJason Evans 	if (nent > AIO_LISTIO_MAX)
19522244ea07SJohn Dyson 		return EINVAL;
19532244ea07SJohn Dyson 
1954bfbbc4aaSJason Evans 	if (p->p_aioinfo == NULL)
19552244ea07SJohn Dyson 		aio_init_aioinfo(p);
19562244ea07SJohn Dyson 
1957bfbbc4aaSJason Evans 	if ((nent + num_queue_count) > max_queue_count)
19582244ea07SJohn Dyson 		return EAGAIN;
19592244ea07SJohn Dyson 
19602244ea07SJohn Dyson 	ki = p->p_aioinfo;
1961bfbbc4aaSJason Evans 	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
19622244ea07SJohn Dyson 		return EAGAIN;
19632244ea07SJohn Dyson 
196484af4da6SJohn Dyson 	lj = zalloc(aiolio_zone);
1965bfbbc4aaSJason Evans 	if (!lj)
196684af4da6SJohn Dyson 		return EAGAIN;
196784af4da6SJohn Dyson 
196884af4da6SJohn Dyson 	lj->lioj_flags = 0;
196984af4da6SJohn Dyson 	lj->lioj_buffer_count = 0;
197084af4da6SJohn Dyson 	lj->lioj_buffer_finished_count = 0;
197184af4da6SJohn Dyson 	lj->lioj_queue_count = 0;
197284af4da6SJohn Dyson 	lj->lioj_queue_finished_count = 0;
197384af4da6SJohn Dyson 	lj->lioj_ki = ki;
197484af4da6SJohn Dyson 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
197584af4da6SJohn Dyson 
197684af4da6SJohn Dyson 	/*
1977bfbbc4aaSJason Evans 	 * Setup signal.
197884af4da6SJohn Dyson 	 */
197984af4da6SJohn Dyson 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1980bfbbc4aaSJason Evans 		error = copyin(uap->sig, &lj->lioj_signal,
1981bfbbc4aaSJason Evans 		    sizeof(lj->lioj_signal));
198284af4da6SJohn Dyson 		if (error)
198384af4da6SJohn Dyson 			return error;
198484af4da6SJohn Dyson 		lj->lioj_flags |= LIOJ_SIGNAL;
198584af4da6SJohn Dyson 		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
1986bfbbc4aaSJason Evans 	} else
198784af4da6SJohn Dyson 		lj->lioj_flags &= ~LIOJ_SIGNAL;
198884af4da6SJohn Dyson 
19892244ea07SJohn Dyson 	/*
1990bfbbc4aaSJason Evans 	 * Get pointers to the list of I/O requests.
19912244ea07SJohn Dyson 	 */
1992fd3bf775SJohn Dyson 	nerror = 0;
1993fd3bf775SJohn Dyson 	nentqueued = 0;
19942244ea07SJohn Dyson 	cbptr = uap->acb_list;
19952244ea07SJohn Dyson 	for (i = 0; i < uap->nent; i++) {
199630166fabSBruce Evans 		iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
199730166fabSBruce Evans 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) {
199884af4da6SJohn Dyson 			error = _aio_aqueue(p, iocb, lj, 0);
1999bfbbc4aaSJason Evans 			if (error == 0)
20002244ea07SJohn Dyson 				nentqueued++;
2001bfbbc4aaSJason Evans 			else
2002fd3bf775SJohn Dyson 				nerror++;
2003fd3bf775SJohn Dyson 		}
2004fd3bf775SJohn Dyson 	}
20052244ea07SJohn Dyson 
2006a624e84fSJohn Dyson 	/*
2007bfbbc4aaSJason Evans 	 * If we haven't queued any, then just return error.
2008a624e84fSJohn Dyson 	 */
2009bfbbc4aaSJason Evans 	if (nentqueued == 0)
2010fd3bf775SJohn Dyson 		return 0;
20112244ea07SJohn Dyson 
2012a624e84fSJohn Dyson 	/*
2013bfbbc4aaSJason Evans 	 * Calculate the appropriate error return.
2014a624e84fSJohn Dyson 	 */
20152244ea07SJohn Dyson 	runningcode = 0;
2016fd3bf775SJohn Dyson 	if (nerror)
20172244ea07SJohn Dyson 		runningcode = EIO;
20182244ea07SJohn Dyson 
20192244ea07SJohn Dyson 	if (uap->mode == LIO_WAIT) {
2020bfbbc4aaSJason Evans 		int command, found, jobref;
2021bfbbc4aaSJason Evans 
2022bfbbc4aaSJason Evans 		for (;;) {
2023fd3bf775SJohn Dyson 			found = 0;
2024fd3bf775SJohn Dyson 			for (i = 0; i < uap->nent; i++) {
2025a624e84fSJohn Dyson 				/*
2026bfbbc4aaSJason Evans 				 * Fetch address of the control buf pointer in
2027bfbbc4aaSJason Evans 				 * user space.
2028a624e84fSJohn Dyson 				 */
202930166fabSBruce Evans 				iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2030bfbbc4aaSJason Evans 				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2031bfbbc4aaSJason Evans 				    == 0))
2032fd3bf775SJohn Dyson 					continue;
2033a624e84fSJohn Dyson 
2034a624e84fSJohn Dyson 				/*
2035bfbbc4aaSJason Evans 				 * Fetch the associated command from user space.
2036a624e84fSJohn Dyson 				 */
20372244ea07SJohn Dyson 				command = fuword(&iocb->aio_lio_opcode);
2038fd3bf775SJohn Dyson 				if (command == LIO_NOP) {
2039fd3bf775SJohn Dyson 					found++;
20402244ea07SJohn Dyson 					continue;
2041fd3bf775SJohn Dyson 				}
2042a624e84fSJohn Dyson 
20432244ea07SJohn Dyson 				jobref = fuword(&iocb->_aiocb_private.kernelinfo);
20442244ea07SJohn Dyson 
2045bfbbc4aaSJason Evans 				for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
20462244ea07SJohn Dyson 				    cb = TAILQ_NEXT(cb, plist)) {
2047bfbbc4aaSJason Evans 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2048bfbbc4aaSJason Evans 					    == jobref) {
2049bfbbc4aaSJason Evans 						if (cb->uaiocb.aio_lio_opcode
2050bfbbc4aaSJason Evans 						    == LIO_WRITE) {
2051bfbbc4aaSJason Evans 							curproc->p_stats->p_ru.ru_oublock
2052bfbbc4aaSJason Evans 							    +=
205384af4da6SJohn Dyson 							    cb->outputcharge;
205484af4da6SJohn Dyson 							cb->outputcharge = 0;
2055bfbbc4aaSJason Evans 						} else if (cb->uaiocb.aio_lio_opcode
2056bfbbc4aaSJason Evans 						    == LIO_READ) {
2057bfbbc4aaSJason Evans 							curproc->p_stats->p_ru.ru_inblock
2058bfbbc4aaSJason Evans 							    += cb->inputcharge;
205984af4da6SJohn Dyson 							cb->inputcharge = 0;
206084af4da6SJohn Dyson 						}
20612244ea07SJohn Dyson 						found++;
20622244ea07SJohn Dyson 						break;
20632244ea07SJohn Dyson 					}
20642244ea07SJohn Dyson 				}
2065fd3bf775SJohn Dyson 
206684af4da6SJohn Dyson 				s = splbio();
2067bfbbc4aaSJason Evans 				for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb;
206884af4da6SJohn Dyson 				    cb = TAILQ_NEXT(cb, plist)) {
2069bfbbc4aaSJason Evans 					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2070bfbbc4aaSJason Evans 					    == jobref) {
207184af4da6SJohn Dyson 						found++;
207284af4da6SJohn Dyson 						break;
2073fd3bf775SJohn Dyson 					}
20742244ea07SJohn Dyson 				}
207584af4da6SJohn Dyson 				splx(s);
207684af4da6SJohn Dyson 			}
20772244ea07SJohn Dyson 
2078a624e84fSJohn Dyson 			/*
2079bfbbc4aaSJason Evans 			 * If all I/Os have been disposed of, then we can
2080bfbbc4aaSJason Evans 			 * return.
2081a624e84fSJohn Dyson 			 */
2082bfbbc4aaSJason Evans 			if (found == nentqueued)
20832244ea07SJohn Dyson 				return runningcode;
20842244ea07SJohn Dyson 
2085fd3bf775SJohn Dyson 			ki->kaio_flags |= KAIO_WAKEUP;
20862244ea07SJohn Dyson 			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
20872244ea07SJohn Dyson 
2088bfbbc4aaSJason Evans 			if (error == EINTR)
20892244ea07SJohn Dyson 				return EINTR;
2090bfbbc4aaSJason Evans 			else if (error == EWOULDBLOCK)
20912244ea07SJohn Dyson 				return EAGAIN;
20922244ea07SJohn Dyson 		}
20932244ea07SJohn Dyson 	}
20942244ea07SJohn Dyson 
20952244ea07SJohn Dyson 	return runningcode;
2096dd85920aSJason Evans #endif /* VFS_AIO */
2097ee877a35SJohn Dyson }
2098fd3bf775SJohn Dyson 
209984af4da6SJohn Dyson /*
2100bfbbc4aaSJason Evans  * This is a wierd hack so that we can post a signal.  It is safe to do so from
2101bfbbc4aaSJason Evans  * a timeout routine, but *not* from an interrupt routine.
210284af4da6SJohn Dyson  */
210384af4da6SJohn Dyson static void
2104bfbbc4aaSJason Evans process_signal(void *aioj)
210584af4da6SJohn Dyson {
2106bfbbc4aaSJason Evans 	struct aiocblist *aiocbe = aioj;
2107bfbbc4aaSJason Evans 	struct aio_liojob *lj = aiocbe->lio;
2108bfbbc4aaSJason Evans 	struct aiocb *cb = &aiocbe->uaiocb;
2109bfbbc4aaSJason Evans 
2110bfbbc4aaSJason Evans 	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2111bfbbc4aaSJason Evans 	    (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
211284af4da6SJohn Dyson 		psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
211384af4da6SJohn Dyson 		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
211484af4da6SJohn Dyson 	}
2115bfbbc4aaSJason Evans 
2116bfbbc4aaSJason Evans 	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2117bfbbc4aaSJason Evans 		psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
211884af4da6SJohn Dyson }
211984af4da6SJohn Dyson 
212084af4da6SJohn Dyson /*
2121bfbbc4aaSJason Evans  * Interrupt handler for physio, performs the necessary process wakeups, and
2122bfbbc4aaSJason Evans  * signals.
212384af4da6SJohn Dyson  */
2124fd3bf775SJohn Dyson static void
2125bfbbc4aaSJason Evans aio_physwakeup(struct buf *bp)
2126fd3bf775SJohn Dyson {
212784af4da6SJohn Dyson 	struct aiocblist *aiocbe;
2128fd3bf775SJohn Dyson 	struct proc *p;
2129fd3bf775SJohn Dyson 	struct kaioinfo *ki;
213084af4da6SJohn Dyson 	struct aio_liojob *lj;
213111783b14SJohn Dyson 	int s;
213211783b14SJohn Dyson 	s = splbio();
2133fd3bf775SJohn Dyson 
2134fd3bf775SJohn Dyson 	wakeup((caddr_t)bp);
2135fd3bf775SJohn Dyson 	bp->b_flags &= ~B_CALL;
213684af4da6SJohn Dyson 	bp->b_flags |= B_DONE;
2137fd3bf775SJohn Dyson 
213884af4da6SJohn Dyson 	aiocbe = (struct aiocblist *)bp->b_spc;
213984af4da6SJohn Dyson 	if (aiocbe) {
2140b0eeea20SPoul-Henning Kamp 		p = bp->b_caller1;
214184af4da6SJohn Dyson 
214284af4da6SJohn Dyson 		aiocbe->jobstate = JOBST_JOBBFINISHED;
214384af4da6SJohn Dyson 		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
214484af4da6SJohn Dyson 		aiocbe->uaiocb._aiocb_private.error = 0;
214584af4da6SJohn Dyson 		aiocbe->jobflags |= AIOCBLIST_DONE;
214684af4da6SJohn Dyson 
2147bfbbc4aaSJason Evans 		if (bp->b_flags & B_ERROR)
214884af4da6SJohn Dyson 			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
214984af4da6SJohn Dyson 
215084af4da6SJohn Dyson 		lj = aiocbe->lio;
215184af4da6SJohn Dyson 		if (lj) {
215284af4da6SJohn Dyson 			lj->lioj_buffer_finished_count++;
2153bfbbc4aaSJason Evans 
215484af4da6SJohn Dyson 			/*
2155bfbbc4aaSJason Evans 			 * wakeup/signal if all of the interrupt jobs are done.
215684af4da6SJohn Dyson 			 */
2157bfbbc4aaSJason Evans 			if (lj->lioj_buffer_finished_count ==
2158bfbbc4aaSJason Evans 			    lj->lioj_buffer_count) {
215984af4da6SJohn Dyson 				/*
2160bfbbc4aaSJason Evans 				 * Post a signal if it is called for.
216184af4da6SJohn Dyson 				 */
2162bfbbc4aaSJason Evans 				if ((lj->lioj_flags &
2163bfbbc4aaSJason Evans 				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
216484af4da6SJohn Dyson 				    LIOJ_SIGNAL) {
216584af4da6SJohn Dyson 					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2166bfbbc4aaSJason Evans 					timeout(process_signal, aiocbe, 0);
216784af4da6SJohn Dyson 				}
216884af4da6SJohn Dyson 			}
216984af4da6SJohn Dyson 		}
217084af4da6SJohn Dyson 
2171fd3bf775SJohn Dyson 		ki = p->p_aioinfo;
217284af4da6SJohn Dyson 		if (ki) {
217384af4da6SJohn Dyson 			ki->kaio_buffer_finished_count++;
217484af4da6SJohn Dyson 			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
217584af4da6SJohn Dyson 			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
217684af4da6SJohn Dyson 			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2177bfbbc4aaSJason Evans 
2178bfbbc4aaSJason Evans 			/* Do the wakeup. */
217984af4da6SJohn Dyson 			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2180fd3bf775SJohn Dyson 				ki->kaio_flags &= ~KAIO_WAKEUP;
2181fd3bf775SJohn Dyson 				wakeup(p);
2182fd3bf775SJohn Dyson 			}
2183fd3bf775SJohn Dyson 		}
2184bfbbc4aaSJason Evans 
2185bfbbc4aaSJason Evans 		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2186bfbbc4aaSJason Evans 			timeout(process_signal, aiocbe, 0);
2187fd3bf775SJohn Dyson 	}
218811783b14SJohn Dyson 	splx(s);
218984af4da6SJohn Dyson }
2190bfbbc4aaSJason Evans 
2191bfbbc4aaSJason Evans int
2192bfbbc4aaSJason Evans aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap)
2193bfbbc4aaSJason Evans {
2194dd85920aSJason Evans #ifndef VFS_AIO
2195dd85920aSJason Evans 	return ENOSYS;
2196dd85920aSJason Evans #else
2197bfbbc4aaSJason Evans 	struct timeval atv;
2198bfbbc4aaSJason Evans 	struct timespec ts;
2199bfbbc4aaSJason Evans 	struct aiocb **cbptr;
2200bfbbc4aaSJason Evans 	struct kaioinfo *ki;
2201bfbbc4aaSJason Evans 	struct aiocblist *cb = NULL;
2202bfbbc4aaSJason Evans 	int error, s, timo;
2203bfbbc4aaSJason Evans 
2204dd85920aSJason Evans 	suword(uap->aiocbp, (int)NULL);
2205dd85920aSJason Evans 
2206bfbbc4aaSJason Evans 	timo = 0;
2207bfbbc4aaSJason Evans 	if (uap->timeout) {
2208bfbbc4aaSJason Evans 		/* Get timespec struct. */
2209bfbbc4aaSJason Evans 		error = copyin((caddr_t)uap->timeout, (caddr_t)&ts,
2210bfbbc4aaSJason Evans 		    sizeof(ts));
2211bfbbc4aaSJason Evans 		if (error)
2212bfbbc4aaSJason Evans 			return error;
2213bfbbc4aaSJason Evans 
2214bfbbc4aaSJason Evans 		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2215bfbbc4aaSJason Evans 			return (EINVAL);
2216bfbbc4aaSJason Evans 
2217bfbbc4aaSJason Evans 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
2218bfbbc4aaSJason Evans 		if (itimerfix(&atv))
2219bfbbc4aaSJason Evans 			return (EINVAL);
2220bfbbc4aaSJason Evans 		timo = tvtohz(&atv);
2221bfbbc4aaSJason Evans 	}
2222bfbbc4aaSJason Evans 
2223bfbbc4aaSJason Evans 	ki = p->p_aioinfo;
2224bfbbc4aaSJason Evans 	if (ki == NULL)
2225bfbbc4aaSJason Evans 		return EAGAIN;
2226bfbbc4aaSJason Evans 
2227bfbbc4aaSJason Evans 	cbptr = uap->aiocbp;
2228bfbbc4aaSJason Evans 
2229bfbbc4aaSJason Evans 	for (;;) {
2230bfbbc4aaSJason Evans 		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2231bfbbc4aaSJason Evans 			suword(uap->aiocbp, (int)cb->uuaiocb);
2232bfbbc4aaSJason Evans 			p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2233bfbbc4aaSJason Evans 			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2234bfbbc4aaSJason Evans 				curproc->p_stats->p_ru.ru_oublock +=
2235bfbbc4aaSJason Evans 				    cb->outputcharge;
2236bfbbc4aaSJason Evans 				cb->outputcharge = 0;
2237bfbbc4aaSJason Evans 			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2238bfbbc4aaSJason Evans 				curproc->p_stats->p_ru.ru_inblock +=
2239bfbbc4aaSJason Evans 				    cb->inputcharge;
2240bfbbc4aaSJason Evans 				cb->inputcharge = 0;
2241bfbbc4aaSJason Evans 			}
2242bfbbc4aaSJason Evans 			aio_free_entry(cb);
2243dd85920aSJason Evans 			return cb->uaiocb._aiocb_private.error;
2244bfbbc4aaSJason Evans 		}
2245bfbbc4aaSJason Evans 
2246bfbbc4aaSJason Evans 		s = splbio();
2247bfbbc4aaSJason Evans  		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
2248bfbbc4aaSJason Evans 			splx(s);
2249bfbbc4aaSJason Evans 			suword(uap->aiocbp, (int)cb->uuaiocb);
2250bfbbc4aaSJason Evans 			p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2251bfbbc4aaSJason Evans 			aio_free_entry(cb);
2252dd85920aSJason Evans 			return cb->uaiocb._aiocb_private.error;
2253bfbbc4aaSJason Evans 		}
2254bfbbc4aaSJason Evans 
2255bfbbc4aaSJason Evans 		ki->kaio_flags |= KAIO_WAKEUP;
2256bfbbc4aaSJason Evans 		error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2257dd85920aSJason Evans 		splx(s);
2258bfbbc4aaSJason Evans 
2259dd85920aSJason Evans 		if (error == ERESTART)
2260dd85920aSJason Evans 			return EINTR;
2261dd85920aSJason Evans 		else if (error < 0)
2262bfbbc4aaSJason Evans 			return error;
2263bfbbc4aaSJason Evans 		else if (error == EINTR)
2264bfbbc4aaSJason Evans 			return EINTR;
2265bfbbc4aaSJason Evans 		else if (error == EWOULDBLOCK)
2266bfbbc4aaSJason Evans 			return EAGAIN;
2267bfbbc4aaSJason Evans 	}
2268dd85920aSJason Evans #endif /* VFS_AIO */
2269bfbbc4aaSJason Evans }
2270