xref: /titanic_41/usr/src/uts/common/os/aio.c (revision 34f9b3eef6fdadbda0a846aa4d68691ac40eace5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Kernel asynchronous I/O.
29  * This is only for raw devices now (as of Nov. 1993).
30  */
31 
32 #include <sys/types.h>
33 #include <sys/errno.h>
34 #include <sys/conf.h>
35 #include <sys/file.h>
36 #include <sys/fs/snode.h>
37 #include <sys/unistd.h>
38 #include <sys/cmn_err.h>
39 #include <vm/as.h>
40 #include <vm/faultcode.h>
41 #include <sys/sysmacros.h>
42 #include <sys/procfs.h>
43 #include <sys/kmem.h>
44 #include <sys/autoconf.h>
45 #include <sys/ddi_impldefs.h>
46 #include <sys/sunddi.h>
47 #include <sys/aio_impl.h>
48 #include <sys/debug.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/vmsystm.h>
52 #include <sys/fs/pxfs_ki.h>
53 #include <sys/contract/process_impl.h>
54 
55 /*
56  * external entry point.
57  */
58 #ifdef _LP64
59 static int64_t kaioc(long, long, long, long, long, long);
60 #endif
61 static int kaio(ulong_t *, rval_t *);
62 
63 
/*
 * Run-mode selectors handed as the trailing argument to the mode-aware
 * helpers (aliowait(), aiosuspend(), aioerror(), aiorw(), aio_cancel()).
 * kaioc() always passes AIO_64; kaio() passes AIO_32 for the plain
 * sub-commands and AIO_LARGEFILE for the "64"-suffixed sub-commands.
 */
#define	AIO_64	0
#define	AIO_32	1
#define	AIO_LARGEFILE	2
67 
68 /*
69  * implementation specific functions (private)
70  */
71 #ifdef _LP64
72 static int alio(int, aiocb_t **, int, struct sigevent *);
73 #endif
74 static int aionotify(void);
75 static int aioinit(void);
76 static int aiostart(void);
77 static void alio_cleanup(aio_t *, aiocb_t **, int, int);
78 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
79     cred_t *);
80 static void lio_set_error(aio_req_t *, int portused);
81 static aio_t *aio_aiop_alloc();
82 static int aio_req_alloc(aio_req_t **, aio_result_t *);
83 static int aio_lio_alloc(aio_lio_t **);
84 static aio_req_t *aio_req_done(void *);
85 static aio_req_t *aio_req_remove(aio_req_t *);
86 static int aio_req_find(aio_result_t *, aio_req_t **);
87 static int aio_hash_insert(struct aio_req_t *, aio_t *);
88 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
89     aio_result_t *, vnode_t *, int);
90 static int aio_cleanup_thread(aio_t *);
91 static aio_lio_t *aio_list_get(aio_result_t *);
92 static void lio_set_uerror(void *, int);
93 extern void aio_zerolen(aio_req_t *);
94 static int aiowait(struct timeval *, int, long	*);
95 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
96 static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
97     aio_req_t *reqlist, aio_t *aiop, model_t model);
98 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
99 static int aiosuspend(void *, int, struct  timespec *, int,
100     long	*, int);
101 static int aliowait(int, void *, int, void *, int);
102 static int aioerror(void *, int);
103 static int aio_cancel(int, void *, long	*, int);
104 static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
105 static int aiorw(int, void *, int, int);
106 
107 static int alioLF(int, void *, int, void *);
108 static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
109     aio_result_t *, vnode_t *, int);
110 static int alio32(int, void *, int, void *);
111 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
112 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
113 
114 #ifdef  _SYSCALL32_IMPL
115 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
116 void	aiocb_32ton(aiocb32_t *, aiocb_t *);
117 #endif /* _SYSCALL32_IMPL */
118 
119 /*
120  * implementation specific functions (external)
121  */
122 void aio_req_free(aio_t *, aio_req_t *);
123 
124 /*
125  * Event Port framework
126  */
127 
128 void aio_req_free_port(aio_t *, aio_req_t *);
129 static int aio_port_callback(void *, int *, pid_t, int, void *);
130 
131 /*
132  * This is the loadable module wrapper.
133  */
134 #include <sys/modctl.h>
135 #include <sys/syscall.h>
136 
#ifdef _LP64

/*
 * System call table entry used when _LP64 is defined: the handler is
 * kaioc(), which takes six long arguments (matching the leading 6).
 */
static struct sysent kaio_sysent = {
	6,
	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
	(int (*)())kaioc
};

#ifdef _SYSCALL32_IMPL
/*
 * Companion entry registered through modlsys32 for 32 bit callers;
 * the handler is kaio(), which reads its arguments from uap[0..6].
 */
static struct sysent kaio_sysent32 = {
	7,
	SE_NOUNLOAD | SE_64RVAL,
	kaio
};
#endif  /* _SYSCALL32_IMPL */

#else   /* _LP64 */

/*
 * System call table entry used when _LP64 is not defined: the handler
 * is kaio(), which reads its arguments from uap[0..6].
 */
static struct sysent kaio_sysent = {
	7,
	SE_NOUNLOAD | SE_32RVAL1,
	kaio
};

#endif  /* _LP64 */
162 
163 /*
164  * Module linkage information for the kernel.
165  */
166 
/* Linkage record tying the native kaio_sysent entry to mod_syscallops. */
static struct modlsys modlsys = {
	&mod_syscallops,
	"kernel Async I/O",
	&kaio_sysent
};
172 
#ifdef  _SYSCALL32_IMPL
/*
 * Linkage record tying the 32 bit compatibility entry (kaio_sysent32)
 * to mod_syscallops32; only built when _SYSCALL32_IMPL is defined.
 */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"kernel Async I/O for 32 bit compatibility",
	&kaio_sysent32
};
#endif  /* _SYSCALL32_IMPL */
180 
181 
/*
 * Top-level linkage structure passed to mod_install(), mod_remove() and
 * mod_info() by _init(), _fini() and _info().  It lists the linkage
 * records defined above, followed by a NULL terminator.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef  _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
190 
191 int
192 _init(void)
193 {
194 	int retval;
195 
196 	if ((retval = mod_install(&modlinkage)) != 0)
197 		return (retval);
198 
199 	return (0);
200 }
201 
202 int
203 _fini(void)
204 {
205 	int retval;
206 
207 	retval = mod_remove(&modlinkage);
208 
209 	return (retval);
210 }
211 
212 int
213 _info(struct modinfo *modinfop)
214 {
215 	return (mod_info(&modlinkage, modinfop));
216 }
217 
218 #ifdef	_LP64
219 static int64_t
220 kaioc(
221 	long	a0,
222 	long	a1,
223 	long	a2,
224 	long	a3,
225 	long	a4,
226 	long	a5)
227 {
228 	int	error;
229 	long	rval = 0;
230 
231 	switch ((int)a0 & ~AIO_POLL_BIT) {
232 	case AIOREAD:
233 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
234 		    (offset_t)a4, (aio_result_t *)a5, FREAD);
235 		break;
236 	case AIOWRITE:
237 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
238 		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
239 		break;
240 	case AIOWAIT:
241 		error = aiowait((struct timeval *)a1, (int)a2, &rval);
242 		break;
243 	case AIOWAITN:
244 		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
245 		    (timespec_t *)a4);
246 		break;
247 	case AIONOTIFY:
248 		error = aionotify();
249 		break;
250 	case AIOINIT:
251 		error = aioinit();
252 		break;
253 	case AIOSTART:
254 		error = aiostart();
255 		break;
256 	case AIOLIO:
257 		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
258 		    (struct sigevent *)a4);
259 		break;
260 	case AIOLIOWAIT:
261 		error = aliowait((int)a1, (void *)a2, (int)a3,
262 		    (struct sigevent *)a4, AIO_64);
263 		break;
264 	case AIOSUSPEND:
265 		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
266 		    (int)a4, &rval, AIO_64);
267 		break;
268 	case AIOERROR:
269 		error = aioerror((void *)a1, AIO_64);
270 		break;
271 	case AIOAREAD:
272 		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
273 		break;
274 	case AIOAWRITE:
275 		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
276 		break;
277 	case AIOCANCEL:
278 		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
279 		break;
280 
281 	/*
282 	 * The large file related stuff is valid only for
283 	 * 32 bit kernel and not for 64 bit kernel
284 	 * On 64 bit kernel we convert large file calls
285 	 * to regular 64bit calls.
286 	 */
287 
288 	default:
289 		error = EINVAL;
290 	}
291 	if (error)
292 		return ((int64_t)set_errno(error));
293 	return (rval);
294 }
295 #endif
296 
297 static int
298 kaio(
299 	ulong_t *uap,
300 	rval_t *rvp)
301 {
302 	long rval = 0;
303 	int	error = 0;
304 	offset_t	off;
305 
306 
307 		rvp->r_vals = 0;
308 #if defined(_LITTLE_ENDIAN)
309 	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
310 #else
311 	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
312 #endif
313 
314 	switch (uap[0] & ~AIO_POLL_BIT) {
315 	/*
316 	 * It must be the 32 bit system call on 64 bit kernel
317 	 */
318 	case AIOREAD:
319 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
320 		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
321 	case AIOWRITE:
322 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
323 		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
324 	case AIOWAIT:
325 		error = aiowait((struct	timeval *)uap[1], (int)uap[2],
326 		    &rval);
327 		break;
328 	case AIOWAITN:
329 		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
330 		    (uint_t *)uap[3], (timespec_t *)uap[4]);
331 		break;
332 	case AIONOTIFY:
333 		return (aionotify());
334 	case AIOINIT:
335 		return (aioinit());
336 	case AIOSTART:
337 		return (aiostart());
338 	case AIOLIO:
339 		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
340 		    (void *)uap[4]));
341 	case AIOLIOWAIT:
342 		return (aliowait((int)uap[1], (void *)uap[2],
343 		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
344 	case AIOSUSPEND:
345 		error = aiosuspend((void *)uap[1], (int)uap[2],
346 		    (timespec_t *)uap[3], (int)uap[4],
347 		    &rval, AIO_32);
348 		break;
349 	case AIOERROR:
350 		return (aioerror((void *)uap[1], AIO_32));
351 	case AIOAREAD:
352 		return (aiorw((int)uap[0], (void *)uap[1],
353 		    FREAD, AIO_32));
354 	case AIOAWRITE:
355 		return (aiorw((int)uap[0], (void *)uap[1],
356 		    FWRITE, AIO_32));
357 	case AIOCANCEL:
358 		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
359 		    AIO_32));
360 		break;
361 	case AIOLIO64:
362 		return (alioLF((int)uap[1], (void *)uap[2],
363 		    (int)uap[3], (void *)uap[4]));
364 	case AIOLIOWAIT64:
365 		return (aliowait(uap[1], (void *)uap[2],
366 		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
367 	case AIOSUSPEND64:
368 		error = aiosuspend((void *)uap[1], (int)uap[2],
369 		    (timespec_t *)uap[3], (int)uap[4], &rval,
370 		    AIO_LARGEFILE);
371 		break;
372 	case AIOERROR64:
373 		return (aioerror((void *)uap[1], AIO_LARGEFILE));
374 	case AIOAREAD64:
375 		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
376 		    AIO_LARGEFILE));
377 	case AIOAWRITE64:
378 		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
379 		    AIO_LARGEFILE));
380 	case AIOCANCEL64:
381 		error = (aio_cancel((int)uap[1], (void *)uap[2],
382 		    &rval, AIO_LARGEFILE));
383 		break;
384 	default:
385 		return (EINVAL);
386 	}
387 
388 	rvp->r_val1 = rval;
389 	return (error);
390 }
391 
392 /*
393  * wake up LWPs in this process that are sleeping in
394  * aiowait().
395  */
396 static int
397 aionotify(void)
398 {
399 	aio_t	*aiop;
400 
401 	aiop = curproc->p_aio;
402 	if (aiop == NULL)
403 		return (0);
404 
405 	mutex_enter(&aiop->aio_mutex);
406 	aiop->aio_notifycnt++;
407 	cv_broadcast(&aiop->aio_waitcv);
408 	mutex_exit(&aiop->aio_mutex);
409 
410 	return (0);
411 }
412 
413 static int
414 timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
415 	timestruc_t **rqtp, int *blocking)
416 {
417 #ifdef	_SYSCALL32_IMPL
418 	struct timeval32 wait_time_32;
419 #endif
420 	struct timeval wait_time;
421 	model_t	model = get_udatamodel();
422 
423 	*rqtp = NULL;
424 	if (timout == NULL) {		/* wait indefinitely */
425 		*blocking = 1;
426 		return (0);
427 	}
428 
429 	/*
430 	 * Need to correctly compare with the -1 passed in for a user
431 	 * address pointer, with both 32 bit and 64 bit apps.
432 	 */
433 	if (model == DATAMODEL_NATIVE) {
434 		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
435 			*blocking = 0;
436 			return (0);
437 		}
438 
439 		if (copyin(timout, &wait_time, sizeof (wait_time)))
440 			return (EFAULT);
441 	}
442 #ifdef	_SYSCALL32_IMPL
443 	else {
444 		/*
445 		 * -1 from a 32bit app. It will not get sign extended.
446 		 * don't wait if -1.
447 		 */
448 		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
449 			*blocking = 0;
450 			return (0);
451 		}
452 
453 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
454 			return (EFAULT);
455 		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
456 	}
457 #endif  /* _SYSCALL32_IMPL */
458 
459 	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
460 		*blocking = 0;
461 		return (0);
462 	}
463 
464 	if (wait_time.tv_sec < 0 ||
465 	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
466 		return (EINVAL);
467 
468 	rqtime->tv_sec = wait_time.tv_sec;
469 	rqtime->tv_nsec = wait_time.tv_usec * 1000;
470 	*rqtp = rqtime;
471 	*blocking = 1;
472 
473 	return (0);
474 }
475 
476 static int
477 timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
478 	timestruc_t **rqtp, int *blocking)
479 {
480 #ifdef	_SYSCALL32_IMPL
481 	timespec32_t wait_time_32;
482 #endif
483 	model_t	model = get_udatamodel();
484 
485 	*rqtp = NULL;
486 	if (timout == NULL) {
487 		*blocking = 1;
488 		return (0);
489 	}
490 
491 	if (model == DATAMODEL_NATIVE) {
492 		if (copyin(timout, rqtime, sizeof (*rqtime)))
493 			return (EFAULT);
494 	}
495 #ifdef	_SYSCALL32_IMPL
496 	else {
497 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
498 			return (EFAULT);
499 		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
500 	}
501 #endif  /* _SYSCALL32_IMPL */
502 
503 	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
504 		*blocking = 0;
505 		return (0);
506 	}
507 
508 	if (rqtime->tv_sec < 0 ||
509 	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
510 		return (EINVAL);
511 
512 	*rqtp = rqtime;
513 	*blocking = 1;
514 
515 	return (0);
516 }
517 
518 /*ARGSUSED*/
519 static int
520 aiowait(
521 	struct timeval	*timout,
522 	int	dontblockflg,
523 	long	*rval)
524 {
525 	int 		error;
526 	aio_t		*aiop;
527 	aio_req_t	*reqp;
528 	clock_t		status;
529 	int		blocking;
530 	int		timecheck;
531 	timestruc_t	rqtime;
532 	timestruc_t	*rqtp;
533 
534 	aiop = curproc->p_aio;
535 	if (aiop == NULL)
536 		return (EINVAL);
537 
538 	/*
539 	 * Establish the absolute future time for the timeout.
540 	 */
541 	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
542 	if (error)
543 		return (error);
544 	if (rqtp) {
545 		timestruc_t now;
546 		timecheck = timechanged;
547 		gethrestime(&now);
548 		timespecadd(rqtp, &now);
549 	}
550 
551 	mutex_enter(&aiop->aio_mutex);
552 	for (;;) {
553 		/* process requests on poll queue */
554 		if (aiop->aio_pollq) {
555 			mutex_exit(&aiop->aio_mutex);
556 			aio_cleanup(0);
557 			mutex_enter(&aiop->aio_mutex);
558 		}
559 		if ((reqp = aio_req_remove(NULL)) != NULL) {
560 			*rval = (long)reqp->aio_req_resultp;
561 			break;
562 		}
563 		/* user-level done queue might not be empty */
564 		if (aiop->aio_notifycnt > 0) {
565 			aiop->aio_notifycnt--;
566 			*rval = 1;
567 			break;
568 		}
569 		/* don't block if no outstanding aio */
570 		if (aiop->aio_outstanding == 0 && dontblockflg) {
571 			error = EINVAL;
572 			break;
573 		}
574 		if (blocking) {
575 			status = cv_waituntil_sig(&aiop->aio_waitcv,
576 			    &aiop->aio_mutex, rqtp, timecheck);
577 
578 			if (status > 0)		/* check done queue again */
579 				continue;
580 			if (status == 0) {	/* interrupted by a signal */
581 				error = EINTR;
582 				*rval = -1;
583 			} else {		/* timer expired */
584 				error = ETIME;
585 			}
586 		}
587 		break;
588 	}
589 	mutex_exit(&aiop->aio_mutex);
590 	if (reqp) {
591 		aphysio_unlock(reqp);
592 		aio_copyout_result(reqp);
593 		mutex_enter(&aiop->aio_mutex);
594 		aio_req_free(aiop, reqp);
595 		mutex_exit(&aiop->aio_mutex);
596 	}
597 	return (error);
598 }
599 
600 /*
601  * aiowaitn can be used to reap completed asynchronous requests submitted with
602  * lio_listio, aio_read or aio_write.
603  * This function only reaps asynchronous raw I/Os.
604  */
605 
606 /*ARGSUSED*/
607 static int
608 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
609 {
610 	int 		error = 0;
611 	aio_t		*aiop;
612 	aio_req_t	*reqlist = NULL;
613 	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
614 	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
615 	size_t		iocbsz;			/* users iocb size */
616 	size_t		riocbsz;		/* returned iocb size */
617 	int		iocb_index = 0;
618 	model_t		model = get_udatamodel();
619 	int		blocking = 1;
620 	int		timecheck;
621 	timestruc_t	rqtime;
622 	timestruc_t	*rqtp;
623 
624 	aiop = curproc->p_aio;
625 	if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX)
626 		return (EINVAL);
627 
628 	if (aiop->aio_outstanding == 0)
629 		return (EAGAIN);
630 
631 	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
632 		return (EFAULT);
633 
634 	/* set *nwait to zero, if we must return prematurely */
635 	if (copyout(&cnt, nwait, sizeof (uint_t)))
636 		return (EFAULT);
637 
638 	if (waitcnt == 0) {
639 		blocking = 0;
640 		rqtp = NULL;
641 		waitcnt = nent;
642 	} else {
643 		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
644 		if (error)
645 			return (error);
646 	}
647 
648 	if (model == DATAMODEL_NATIVE)
649 		iocbsz = (sizeof (aiocb_t *) * nent);
650 #ifdef	_SYSCALL32_IMPL
651 	else
652 		iocbsz = (sizeof (caddr32_t) * nent);
653 #endif  /* _SYSCALL32_IMPL */
654 
655 	/*
656 	 * Only one aio_waitn call is allowed at a time.
657 	 * The active aio_waitn will collect all requests
658 	 * out of the "done" list and if necessary it will wait
659 	 * for some/all pending requests to fulfill the nwait
660 	 * parameter.
661 	 * A second or further aio_waitn calls will sleep here
662 	 * until the active aio_waitn finishes and leaves the kernel
663 	 * If the second call does not block (poll), then return
664 	 * immediately with the error code : EAGAIN.
665 	 * If the second call should block, then sleep here, but
666 	 * do not touch the timeout. The timeout starts when this
667 	 * aio_waitn-call becomes active.
668 	 */
669 
670 	mutex_enter(&aiop->aio_mutex);
671 
672 	while (aiop->aio_flags & AIO_WAITN) {
673 		if (blocking == 0) {
674 			mutex_exit(&aiop->aio_mutex);
675 			return (EAGAIN);
676 		}
677 
678 		/* block, no timeout */
679 		aiop->aio_flags |= AIO_WAITN_PENDING;
680 		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
681 			mutex_exit(&aiop->aio_mutex);
682 			return (EINTR);
683 		}
684 	}
685 
686 	/*
687 	 * Establish the absolute future time for the timeout.
688 	 */
689 	if (rqtp) {
690 		timestruc_t now;
691 		timecheck = timechanged;
692 		gethrestime(&now);
693 		timespecadd(rqtp, &now);
694 	}
695 
696 	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
697 		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
698 		aiop->aio_iocb = NULL;
699 	}
700 
701 	if (aiop->aio_iocb == NULL) {
702 		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
703 		if (iocblist == NULL) {
704 			mutex_exit(&aiop->aio_mutex);
705 			return (ENOMEM);
706 		}
707 		aiop->aio_iocb = (aiocb_t **)iocblist;
708 		aiop->aio_iocbsz = iocbsz;
709 	} else {
710 		iocblist = (char *)aiop->aio_iocb;
711 	}
712 
713 	aiop->aio_waitncnt = waitcnt;
714 	aiop->aio_flags |= AIO_WAITN;
715 
716 	for (;;) {
717 		/* push requests on poll queue to done queue */
718 		if (aiop->aio_pollq) {
719 			mutex_exit(&aiop->aio_mutex);
720 			aio_cleanup(0);
721 			mutex_enter(&aiop->aio_mutex);
722 		}
723 
724 		/* check for requests on done queue */
725 		if (aiop->aio_doneq) {
726 			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
727 			aiop->aio_waitncnt = waitcnt - cnt;
728 		}
729 
730 		/* user-level done queue might not be empty */
731 		if (aiop->aio_notifycnt > 0) {
732 			aiop->aio_notifycnt--;
733 			error = 0;
734 			break;
735 		}
736 
737 		/*
738 		 * if we are here second time as a result of timer
739 		 * expiration, we reset error if there are enough
740 		 * aiocb's to satisfy request.
741 		 * We return also if all requests are already done
742 		 * and we picked up the whole done queue.
743 		 */
744 
745 		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
746 		    aiop->aio_doneq == NULL)) {
747 			error = 0;
748 			break;
749 		}
750 
751 		if ((cnt < waitcnt) && blocking) {
752 			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
753 			    &aiop->aio_mutex, rqtp, timecheck);
754 			if (rval > 0)
755 				continue;
756 			if (rval < 0) {
757 				error = ETIME;
758 				blocking = 0;
759 				continue;
760 			}
761 			error = EINTR;
762 		}
763 		break;
764 	}
765 
766 	mutex_exit(&aiop->aio_mutex);
767 
768 	if (cnt > 0) {
769 
770 		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
771 		    aiop, model);
772 
773 		if (model == DATAMODEL_NATIVE)
774 			riocbsz = (sizeof (aiocb_t *) * cnt);
775 #ifdef	_SYSCALL32_IMPL
776 		else
777 			riocbsz = (sizeof (caddr32_t) * cnt);
778 #endif  /* _SYSCALL32_IMPL */
779 
780 		if (copyout(iocblist, uiocb, riocbsz) ||
781 		    copyout(&cnt, nwait, sizeof (uint_t)))
782 			error = EFAULT;
783 	}
784 
785 	/* check if there is another thread waiting for execution */
786 	mutex_enter(&aiop->aio_mutex);
787 	aiop->aio_flags &= ~AIO_WAITN;
788 	if (aiop->aio_flags & AIO_WAITN_PENDING) {
789 		aiop->aio_flags &= ~AIO_WAITN_PENDING;
790 		cv_signal(&aiop->aio_waitncv);
791 	}
792 	mutex_exit(&aiop->aio_mutex);
793 
794 	return (error);
795 }
796 
797 /*
798  * aio_unlock_requests
799  * copyouts the result of the request as well as the return value.
800  * It builds the list of completed asynchronous requests,
801  * unlocks the allocated memory ranges and
802  * put the aio request structure back into the free list.
803  */
804 
805 static int
806 aio_unlock_requests(
807 	caddr_t	iocblist,
808 	int	iocb_index,
809 	aio_req_t *reqlist,
810 	aio_t	*aiop,
811 	model_t	model)
812 {
813 	aio_req_t	*reqp, *nreqp;
814 
815 	if (model == DATAMODEL_NATIVE) {
816 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
817 			(((caddr_t *)iocblist)[iocb_index++]) =
818 			    reqp->aio_req_iocb.iocb;
819 			nreqp = reqp->aio_req_next;
820 			aphysio_unlock(reqp);
821 			aio_copyout_result(reqp);
822 			mutex_enter(&aiop->aio_mutex);
823 			aio_req_free(aiop, reqp);
824 			mutex_exit(&aiop->aio_mutex);
825 		}
826 	}
827 #ifdef	_SYSCALL32_IMPL
828 	else {
829 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
830 			((caddr32_t *)iocblist)[iocb_index++] =
831 			    reqp->aio_req_iocb.iocb32;
832 			nreqp = reqp->aio_req_next;
833 			aphysio_unlock(reqp);
834 			aio_copyout_result(reqp);
835 			mutex_enter(&aiop->aio_mutex);
836 			aio_req_free(aiop, reqp);
837 			mutex_exit(&aiop->aio_mutex);
838 		}
839 	}
840 #endif	/* _SYSCALL32_IMPL */
841 	return (iocb_index);
842 }
843 
844 /*
845  * aio_reqlist_concat
846  * moves "max" elements from the done queue to the reqlist queue and removes
847  * the AIO_DONEQ flag.
848  * - reqlist queue is a simple linked list
849  * - done queue is a double linked list
850  */
851 
852 static int
853 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
854 {
855 	aio_req_t *q2, *q2work, *list;
856 	int count = 0;
857 
858 	list = *reqlist;
859 	q2 = aiop->aio_doneq;
860 	q2work = q2;
861 	while (max-- > 0) {
862 		q2work->aio_req_flags &= ~AIO_DONEQ;
863 		q2work = q2work->aio_req_next;
864 		count++;
865 		if (q2work == q2)
866 			break;
867 	}
868 
869 	if (q2work == q2) {
870 		/* all elements revised */
871 		q2->aio_req_prev->aio_req_next = list;
872 		list = q2;
873 		aiop->aio_doneq = NULL;
874 	} else {
875 		/*
876 		 * max < elements in the doneq
877 		 * detach only the required amount of elements
878 		 * out of the doneq
879 		 */
880 		q2work->aio_req_prev->aio_req_next = list;
881 		list = q2;
882 
883 		aiop->aio_doneq = q2work;
884 		q2work->aio_req_prev = q2->aio_req_prev;
885 		q2->aio_req_prev->aio_req_next = q2work;
886 	}
887 	*reqlist = list;
888 	return (count);
889 }
890 
891 /*ARGSUSED*/
892 static int
893 aiosuspend(
894 	void	*aiocb,
895 	int	nent,
896 	struct	timespec	*timout,
897 	int	flag,
898 	long	*rval,
899 	int	run_mode)
900 {
901 	int 		error;
902 	aio_t		*aiop;
903 	aio_req_t	*reqp, *found, *next;
904 	caddr_t		cbplist = NULL;
905 	aiocb_t		*cbp, **ucbp;
906 #ifdef	_SYSCALL32_IMPL
907 	aiocb32_t	*cbp32;
908 	caddr32_t	*ucbp32;
909 #endif  /* _SYSCALL32_IMPL */
910 	aiocb64_32_t	*cbp64;
911 	int		rv;
912 	int		i;
913 	size_t		ssize;
914 	model_t		model = get_udatamodel();
915 	int		blocking;
916 	int		timecheck;
917 	timestruc_t	rqtime;
918 	timestruc_t	*rqtp;
919 
920 	aiop = curproc->p_aio;
921 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
922 		return (EINVAL);
923 
924 	/*
925 	 * Establish the absolute future time for the timeout.
926 	 */
927 	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
928 	if (error)
929 		return (error);
930 	if (rqtp) {
931 		timestruc_t now;
932 		timecheck = timechanged;
933 		gethrestime(&now);
934 		timespecadd(rqtp, &now);
935 	}
936 
937 	/*
938 	 * If we are not blocking and there's no IO complete
939 	 * skip aiocb copyin.
940 	 */
941 	if (!blocking && (aiop->aio_pollq == NULL) &&
942 	    (aiop->aio_doneq == NULL)) {
943 		return (EAGAIN);
944 	}
945 
946 	if (model == DATAMODEL_NATIVE)
947 		ssize = (sizeof (aiocb_t *) * nent);
948 #ifdef	_SYSCALL32_IMPL
949 	else
950 		ssize = (sizeof (caddr32_t) * nent);
951 #endif  /* _SYSCALL32_IMPL */
952 
953 	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
954 	if (cbplist == NULL)
955 		return (ENOMEM);
956 
957 	if (copyin(aiocb, cbplist, ssize)) {
958 		error = EFAULT;
959 		goto done;
960 	}
961 
962 	found = NULL;
963 	/*
964 	 * we need to get the aio_cleanupq_mutex since we call
965 	 * aio_req_done().
966 	 */
967 	mutex_enter(&aiop->aio_cleanupq_mutex);
968 	mutex_enter(&aiop->aio_mutex);
969 	for (;;) {
970 		/* push requests on poll queue to done queue */
971 		if (aiop->aio_pollq) {
972 			mutex_exit(&aiop->aio_mutex);
973 			mutex_exit(&aiop->aio_cleanupq_mutex);
974 			aio_cleanup(0);
975 			mutex_enter(&aiop->aio_cleanupq_mutex);
976 			mutex_enter(&aiop->aio_mutex);
977 		}
978 		/* check for requests on done queue */
979 		if (aiop->aio_doneq) {
980 			if (model == DATAMODEL_NATIVE)
981 				ucbp = (aiocb_t **)cbplist;
982 #ifdef	_SYSCALL32_IMPL
983 			else
984 				ucbp32 = (caddr32_t *)cbplist;
985 #endif  /* _SYSCALL32_IMPL */
986 			for (i = 0; i < nent; i++) {
987 				if (model == DATAMODEL_NATIVE) {
988 					if ((cbp = *ucbp++) == NULL)
989 						continue;
990 					if (run_mode != AIO_LARGEFILE)
991 						reqp = aio_req_done(
992 						    &cbp->aio_resultp);
993 					else {
994 						cbp64 = (aiocb64_32_t *)cbp;
995 						reqp = aio_req_done(
996 						    &cbp64->aio_resultp);
997 					}
998 				}
999 #ifdef	_SYSCALL32_IMPL
1000 				else {
1001 					if (run_mode == AIO_32) {
1002 						if ((cbp32 =
1003 						    (aiocb32_t *)(uintptr_t)
1004 						    *ucbp32++) == NULL)
1005 							continue;
1006 						reqp = aio_req_done(
1007 						    &cbp32->aio_resultp);
1008 					} else if (run_mode == AIO_LARGEFILE) {
1009 						if ((cbp64 =
1010 						    (aiocb64_32_t *)(uintptr_t)
1011 						    *ucbp32++) == NULL)
1012 							continue;
1013 						reqp = aio_req_done(
1014 						    &cbp64->aio_resultp);
1015 					}
1016 
1017 				}
1018 #endif  /* _SYSCALL32_IMPL */
1019 				if (reqp) {
1020 					reqp->aio_req_next = found;
1021 					found = reqp;
1022 				}
1023 				if (aiop->aio_doneq == NULL)
1024 					break;
1025 			}
1026 			if (found)
1027 				break;
1028 		}
1029 		if (aiop->aio_notifycnt > 0) {
1030 			/*
1031 			 * nothing on the kernel's queue. the user
1032 			 * has notified the kernel that it has items
1033 			 * on a user-level queue.
1034 			 */
1035 			aiop->aio_notifycnt--;
1036 			*rval = 1;
1037 			error = 0;
1038 			break;
1039 		}
1040 		/* don't block if nothing is outstanding */
1041 		if (aiop->aio_outstanding == 0) {
1042 			error = EAGAIN;
1043 			break;
1044 		}
1045 		if (blocking) {
1046 			/*
1047 			 * drop the aio_cleanupq_mutex as we are
1048 			 * going to block.
1049 			 */
1050 			mutex_exit(&aiop->aio_cleanupq_mutex);
1051 			rv = cv_waituntil_sig(&aiop->aio_waitcv,
1052 			    &aiop->aio_mutex, rqtp, timecheck);
1053 			/*
1054 			 * we have to drop aio_mutex and
1055 			 * grab it in the right order.
1056 			 */
1057 			mutex_exit(&aiop->aio_mutex);
1058 			mutex_enter(&aiop->aio_cleanupq_mutex);
1059 			mutex_enter(&aiop->aio_mutex);
1060 			if (rv > 0)	/* check done queue again */
1061 				continue;
1062 			if (rv == 0)	/* interrupted by a signal */
1063 				error = EINTR;
1064 			else		/* timer expired */
1065 				error = ETIME;
1066 		} else {
1067 			error = EAGAIN;
1068 		}
1069 		break;
1070 	}
1071 	mutex_exit(&aiop->aio_mutex);
1072 	mutex_exit(&aiop->aio_cleanupq_mutex);
1073 	for (reqp = found; reqp != NULL; reqp = next) {
1074 		next = reqp->aio_req_next;
1075 		aphysio_unlock(reqp);
1076 		aio_copyout_result(reqp);
1077 		mutex_enter(&aiop->aio_mutex);
1078 		aio_req_free(aiop, reqp);
1079 		mutex_exit(&aiop->aio_mutex);
1080 	}
1081 done:
1082 	kmem_free(cbplist, ssize);
1083 	return (error);
1084 }
1085 
1086 /*
1087  * initialize aio by allocating an aio_t struct for this
1088  * process.
1089  */
1090 static int
1091 aioinit(void)
1092 {
1093 	proc_t *p = curproc;
1094 	aio_t *aiop;
1095 	mutex_enter(&p->p_lock);
1096 	if ((aiop = p->p_aio) == NULL) {
1097 		aiop = aio_aiop_alloc();
1098 		p->p_aio = aiop;
1099 	}
1100 	mutex_exit(&p->p_lock);
1101 	if (aiop == NULL)
1102 		return (ENOMEM);
1103 	return (0);
1104 }
1105 
1106 /*
1107  * start a special thread that will cleanup after aio requests
1108  * that are preventing a segment from being unmapped. as_unmap()
1109  * blocks until all phsyio to this segment is completed. this
1110  * doesn't happen until all the pages in this segment are not
1111  * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
1112  * requests still outstanding. this special thread will make sure
1113  * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
1114  *
1115  * this function will return an error if the process has only
1116  * one LWP. the assumption is that the caller is a separate LWP
1117  * that remains blocked in the kernel for the life of this process.
1118  */
1119 static int
1120 aiostart(void)
1121 {
1122 	proc_t *p = curproc;
1123 	aio_t *aiop;
1124 	int first, error = 0;
1125 
1126 	if (p->p_lwpcnt == 1)
1127 		return (EDEADLK);
1128 	mutex_enter(&p->p_lock);
1129 	if ((aiop = p->p_aio) == NULL)
1130 		error = EINVAL;
1131 	else {
1132 		first = aiop->aio_ok;
1133 		if (aiop->aio_ok == 0)
1134 			aiop->aio_ok = 1;
1135 	}
1136 	mutex_exit(&p->p_lock);
1137 	if (error == 0 && first == 0) {
1138 		return (aio_cleanup_thread(aiop));
1139 		/* should return only to exit */
1140 	}
1141 	return (error);
1142 }
1143 
1144 /*
1145  * Associate an aiocb with a port.
1146  * This function is used by aiorw() to associate a transaction with a port.
1147  * Allocate an event port structure (port_alloc_event()) and store the
1148  * delivered user pointer (portnfy_user) in the portkev_user field of the
1149  * port_kevent_t structure..
1150  * The aio_req_portkev pointer in the aio_req_t structure was added to identify
1151  * the port association.
1152  */
1153 
1154 static int
1155 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
1156 	aio_req_t *reqp, int event)
1157 {
1158 	port_kevent_t	*pkevp = NULL;
1159 	int		error;
1160 
1161 	error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
1162 	    PORT_SOURCE_AIO, &pkevp);
1163 	if (error) {
1164 		if ((error == ENOMEM) || (error == EAGAIN))
1165 			error = EAGAIN;
1166 		else
1167 			error = EINVAL;
1168 	} else {
1169 		port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
1170 		    aio_port_callback, reqp);
1171 		pkevp->portkev_events = event;
1172 		reqp->aio_req_portkev = pkevp;
1173 		reqp->aio_req_port = pntfy->portnfy_port;
1174 	}
1175 	return (error);
1176 }
1177 
1178 #ifdef _LP64
1179 
1180 /*
1181  * Asynchronous list IO. A chain of aiocb's are copied in
1182  * one at a time. If the aiocb is invalid, it is skipped.
1183  * For each aiocb, the appropriate driver entry point is
1184  * called. Optimize for the common case where the list
1185  * of requests is to the same file descriptor.
1186  *
1187  * One possible optimization is to define a new driver entry
1188  * point that supports a list of IO requests. Whether this
1189  * improves performance depends somewhat on the driver's
1190  * locking strategy. Processing a list could adversely impact
1191  * the driver's interrupt latency.
1192  */
1193 static int
1194 alio(
1195 	int		mode_arg,
1196 	aiocb_t		**aiocb_arg,
1197 	int		nent,
1198 	struct sigevent	*sigev)
1199 {
1200 	file_t		*fp;
1201 	file_t		*prev_fp = NULL;
1202 	int		prev_mode = -1;
1203 	struct vnode	*vp;
1204 	aio_lio_t	*head;
1205 	aio_req_t	*reqp;
1206 	aio_t		*aiop;
1207 	caddr_t		cbplist;
1208 	aiocb_t		cb;
1209 	aiocb_t		*aiocb = &cb;
1210 	aiocb_t		*cbp;
1211 	aiocb_t		**ucbp;
1212 	struct sigevent sigevk;
1213 	sigqueue_t	*sqp;
1214 	int		(*aio_func)();
1215 	int		mode;
1216 	int		error = 0;
1217 	int		aio_errors = 0;
1218 	int		i;
1219 	size_t		ssize;
1220 	int		deadhead = 0;
1221 	int		aio_notsupported = 0;
1222 	int		lio_head_port;
1223 	int		aio_port;
1224 	int		aio_thread;
1225 	port_kevent_t	*pkevtp = NULL;
1226 	int		portused = 0;
1227 	port_notify_t	pnotify;
1228 	int		event;
1229 
1230 	aiop = curproc->p_aio;
1231 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1232 		return (EINVAL);
1233 
1234 	ssize = (sizeof (aiocb_t *) * nent);
1235 	cbplist = kmem_alloc(ssize, KM_SLEEP);
1236 	ucbp = (aiocb_t **)cbplist;
1237 
1238 	if (copyin(aiocb_arg, cbplist, ssize) ||
1239 	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
1240 		kmem_free(cbplist, ssize);
1241 		return (EFAULT);
1242 	}
1243 
1244 	/* Event Ports  */
1245 	if (sigev &&
1246 	    (sigevk.sigev_notify == SIGEV_THREAD ||
1247 	    sigevk.sigev_notify == SIGEV_PORT)) {
1248 		if (sigevk.sigev_notify == SIGEV_THREAD) {
1249 			pnotify.portnfy_port = sigevk.sigev_signo;
1250 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
1251 		} else if (copyin(sigevk.sigev_value.sival_ptr,
1252 		    &pnotify, sizeof (pnotify))) {
1253 			kmem_free(cbplist, ssize);
1254 			return (EFAULT);
1255 		}
1256 		error = port_alloc_event(pnotify.portnfy_port,
1257 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
1258 		if (error) {
1259 			if (error == ENOMEM || error == EAGAIN)
1260 				error = EAGAIN;
1261 			else
1262 				error = EINVAL;
1263 			kmem_free(cbplist, ssize);
1264 			return (error);
1265 		}
1266 		lio_head_port = pnotify.portnfy_port;
1267 		portused = 1;
1268 	}
1269 
1270 	/*
1271 	 * a list head should be allocated if notification is
1272 	 * enabled for this list.
1273 	 */
1274 	head = NULL;
1275 
1276 	if (mode_arg == LIO_WAIT || sigev) {
1277 		mutex_enter(&aiop->aio_mutex);
1278 		error = aio_lio_alloc(&head);
1279 		mutex_exit(&aiop->aio_mutex);
1280 		if (error)
1281 			goto done;
1282 		deadhead = 1;
1283 		head->lio_nent = nent;
1284 		head->lio_refcnt = nent;
1285 		head->lio_port = -1;
1286 		head->lio_portkev = NULL;
1287 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
1288 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
1289 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
1290 			if (sqp == NULL) {
1291 				error = EAGAIN;
1292 				goto done;
1293 			}
1294 			sqp->sq_func = NULL;
1295 			sqp->sq_next = NULL;
1296 			sqp->sq_info.si_code = SI_ASYNCIO;
1297 			sqp->sq_info.si_pid = curproc->p_pid;
1298 			sqp->sq_info.si_ctid = PRCTID(curproc);
1299 			sqp->sq_info.si_zoneid = getzoneid();
1300 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
1301 			sqp->sq_info.si_signo = sigevk.sigev_signo;
1302 			sqp->sq_info.si_value = sigevk.sigev_value;
1303 			head->lio_sigqp = sqp;
1304 		} else {
1305 			head->lio_sigqp = NULL;
1306 		}
1307 		if (pkevtp) {
1308 			/*
1309 			 * Prepare data to send when list of aiocb's
1310 			 * has completed.
1311 			 */
1312 			port_init_event(pkevtp, (uintptr_t)sigev,
1313 			    (void *)(uintptr_t)pnotify.portnfy_user,
1314 			    NULL, head);
1315 			pkevtp->portkev_events = AIOLIO;
1316 			head->lio_portkev = pkevtp;
1317 			head->lio_port = pnotify.portnfy_port;
1318 		}
1319 	}
1320 
1321 	for (i = 0; i < nent; i++, ucbp++) {
1322 
1323 		cbp = *ucbp;
1324 		/* skip entry if it can't be copied. */
1325 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
1326 			if (head) {
1327 				mutex_enter(&aiop->aio_mutex);
1328 				head->lio_nent--;
1329 				head->lio_refcnt--;
1330 				mutex_exit(&aiop->aio_mutex);
1331 			}
1332 			continue;
1333 		}
1334 
1335 		/* skip if opcode for aiocb is LIO_NOP */
1336 		mode = aiocb->aio_lio_opcode;
1337 		if (mode == LIO_NOP) {
1338 			cbp = NULL;
1339 			if (head) {
1340 				mutex_enter(&aiop->aio_mutex);
1341 				head->lio_nent--;
1342 				head->lio_refcnt--;
1343 				mutex_exit(&aiop->aio_mutex);
1344 			}
1345 			continue;
1346 		}
1347 
1348 		/* increment file descriptor's ref count. */
1349 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
1350 			lio_set_uerror(&cbp->aio_resultp, EBADF);
1351 			if (head) {
1352 				mutex_enter(&aiop->aio_mutex);
1353 				head->lio_nent--;
1354 				head->lio_refcnt--;
1355 				mutex_exit(&aiop->aio_mutex);
1356 			}
1357 			aio_errors++;
1358 			continue;
1359 		}
1360 
1361 		/*
1362 		 * check the permission of the partition
1363 		 */
1364 		if ((fp->f_flag & mode) == 0) {
1365 			releasef(aiocb->aio_fildes);
1366 			lio_set_uerror(&cbp->aio_resultp, EBADF);
1367 			if (head) {
1368 				mutex_enter(&aiop->aio_mutex);
1369 				head->lio_nent--;
1370 				head->lio_refcnt--;
1371 				mutex_exit(&aiop->aio_mutex);
1372 			}
1373 			aio_errors++;
1374 			continue;
1375 		}
1376 
1377 		/*
1378 		 * common case where requests are to the same fd
1379 		 * for the same r/w operation.
1380 		 * for UFS, need to set EBADFD
1381 		 */
1382 		vp = fp->f_vnode;
1383 		if (fp != prev_fp || mode != prev_mode) {
1384 			aio_func = check_vp(vp, mode);
1385 			if (aio_func == NULL) {
1386 				prev_fp = NULL;
1387 				releasef(aiocb->aio_fildes);
1388 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
1389 				aio_notsupported++;
1390 				if (head) {
1391 					mutex_enter(&aiop->aio_mutex);
1392 					head->lio_nent--;
1393 					head->lio_refcnt--;
1394 					mutex_exit(&aiop->aio_mutex);
1395 				}
1396 				continue;
1397 			} else {
1398 				prev_fp = fp;
1399 				prev_mode = mode;
1400 			}
1401 		}
1402 
1403 		error = aio_req_setup(&reqp, aiop, aiocb,
1404 		    &cbp->aio_resultp, vp, 0);
1405 		if (error) {
1406 			releasef(aiocb->aio_fildes);
1407 			lio_set_uerror(&cbp->aio_resultp, error);
1408 			if (head) {
1409 				mutex_enter(&aiop->aio_mutex);
1410 				head->lio_nent--;
1411 				head->lio_refcnt--;
1412 				mutex_exit(&aiop->aio_mutex);
1413 			}
1414 			aio_errors++;
1415 			continue;
1416 		}
1417 
1418 		reqp->aio_req_lio = head;
1419 		deadhead = 0;
1420 
1421 		/*
1422 		 * Set the errno field now before sending the request to
1423 		 * the driver to avoid a race condition
1424 		 */
1425 		(void) suword32(&cbp->aio_resultp.aio_errno,
1426 		    EINPROGRESS);
1427 
1428 		reqp->aio_req_iocb.iocb = (caddr_t)cbp;
1429 
1430 		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
1431 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
1432 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
1433 		if (aio_port | aio_thread) {
1434 			port_kevent_t *lpkevp;
1435 			/*
1436 			 * Prepare data to send with each aiocb completed.
1437 			 */
1438 			if (aio_port) {
1439 				void *paddr =
1440 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
1441 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
1442 					error = EFAULT;
1443 			} else {	/* aio_thread */
1444 				pnotify.portnfy_port =
1445 				    aiocb->aio_sigevent.sigev_signo;
1446 				pnotify.portnfy_user =
1447 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
1448 			}
1449 			if (error)
1450 				/* EMPTY */;
1451 			else if (pkevtp != NULL &&
1452 			    pnotify.portnfy_port == lio_head_port)
1453 				error = port_dup_event(pkevtp, &lpkevp,
1454 				    PORT_ALLOC_DEFAULT);
1455 			else
1456 				error = port_alloc_event(pnotify.portnfy_port,
1457 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
1458 				    &lpkevp);
1459 			if (error == 0) {
1460 				port_init_event(lpkevp, (uintptr_t)cbp,
1461 				    (void *)(uintptr_t)pnotify.portnfy_user,
1462 				    aio_port_callback, reqp);
1463 				lpkevp->portkev_events = event;
1464 				reqp->aio_req_portkev = lpkevp;
1465 				reqp->aio_req_port = pnotify.portnfy_port;
1466 			}
1467 		}
1468 
1469 		/*
1470 		 * send the request to driver.
1471 		 */
1472 		if (error == 0) {
1473 			if (aiocb->aio_nbytes == 0) {
1474 				clear_active_fd(aiocb->aio_fildes);
1475 				aio_zerolen(reqp);
1476 				continue;
1477 			}
1478 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
1479 			    CRED());
1480 		}
1481 
1482 		/*
1483 		 * the fd's ref count is not decremented until the IO has
1484 		 * completed unless there was an error.
1485 		 */
1486 		if (error) {
1487 			releasef(aiocb->aio_fildes);
1488 			lio_set_uerror(&cbp->aio_resultp, error);
1489 			if (head) {
1490 				mutex_enter(&aiop->aio_mutex);
1491 				head->lio_nent--;
1492 				head->lio_refcnt--;
1493 				mutex_exit(&aiop->aio_mutex);
1494 			}
1495 			if (error == ENOTSUP)
1496 				aio_notsupported++;
1497 			else
1498 				aio_errors++;
1499 			lio_set_error(reqp, portused);
1500 		} else {
1501 			clear_active_fd(aiocb->aio_fildes);
1502 		}
1503 	}
1504 
1505 	if (aio_notsupported) {
1506 		error = ENOTSUP;
1507 	} else if (aio_errors) {
1508 		/*
1509 		 * return EIO if any request failed
1510 		 */
1511 		error = EIO;
1512 	}
1513 
1514 	if (mode_arg == LIO_WAIT) {
1515 		mutex_enter(&aiop->aio_mutex);
1516 		while (head->lio_refcnt > 0) {
1517 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1518 				mutex_exit(&aiop->aio_mutex);
1519 				error = EINTR;
1520 				goto done;
1521 			}
1522 		}
1523 		mutex_exit(&aiop->aio_mutex);
1524 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
1525 	}
1526 
1527 done:
1528 	kmem_free(cbplist, ssize);
1529 	if (deadhead) {
1530 		if (head->lio_sigqp)
1531 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
1532 		if (head->lio_portkev)
1533 			port_free_event(head->lio_portkev);
1534 		kmem_free(head, sizeof (aio_lio_t));
1535 	}
1536 	return (error);
1537 }
1538 
1539 #endif /* _LP64 */
1540 
1541 /*
1542  * Asynchronous list IO.
1543  * If list I/O is called with LIO_WAIT it can still return
1544  * before all the I/O's are completed if a signal is caught
1545  * or if the list include UFS I/O requests. If this happens,
1546  * libaio will call aliowait() to wait for the I/O's to
1547  * complete
1548  */
1549 /*ARGSUSED*/
1550 static int
1551 aliowait(
1552 	int	mode,
1553 	void	*aiocb,
1554 	int	nent,
1555 	void	*sigev,
1556 	int	run_mode)
1557 {
1558 	aio_lio_t	*head;
1559 	aio_t		*aiop;
1560 	caddr_t		cbplist;
1561 	aiocb_t		*cbp, **ucbp;
1562 #ifdef	_SYSCALL32_IMPL
1563 	aiocb32_t	*cbp32;
1564 	caddr32_t	*ucbp32;
1565 	aiocb64_32_t	*cbp64;
1566 #endif
1567 	int		error = 0;
1568 	int		i;
1569 	size_t		ssize = 0;
1570 	model_t		model = get_udatamodel();
1571 
1572 	aiop = curproc->p_aio;
1573 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1574 		return (EINVAL);
1575 
1576 	if (model == DATAMODEL_NATIVE)
1577 		ssize = (sizeof (aiocb_t *) * nent);
1578 #ifdef	_SYSCALL32_IMPL
1579 	else
1580 		ssize = (sizeof (caddr32_t) * nent);
1581 #endif  /* _SYSCALL32_IMPL */
1582 
1583 	if (ssize == 0)
1584 		return (EINVAL);
1585 
1586 	cbplist = kmem_alloc(ssize, KM_SLEEP);
1587 
1588 	if (model == DATAMODEL_NATIVE)
1589 		ucbp = (aiocb_t **)cbplist;
1590 #ifdef	_SYSCALL32_IMPL
1591 	else
1592 		ucbp32 = (caddr32_t *)cbplist;
1593 #endif  /* _SYSCALL32_IMPL */
1594 
1595 	if (copyin(aiocb, cbplist, ssize)) {
1596 		error = EFAULT;
1597 		goto done;
1598 	}
1599 
1600 	/*
1601 	 * To find the list head, we go through the
1602 	 * list of aiocb structs, find the request
1603 	 * its for, then get the list head that reqp
1604 	 * points to
1605 	 */
1606 	head = NULL;
1607 
1608 	for (i = 0; i < nent; i++) {
1609 		if (model == DATAMODEL_NATIVE) {
1610 			/*
1611 			 * Since we are only checking for a NULL pointer
1612 			 * Following should work on both native data sizes
1613 			 * as well as for largefile aiocb.
1614 			 */
1615 			if ((cbp = *ucbp++) == NULL)
1616 				continue;
1617 			if (run_mode != AIO_LARGEFILE)
1618 				if (head = aio_list_get(&cbp->aio_resultp))
1619 					break;
1620 			else {
1621 				/*
1622 				 * This is a case when largefile call is
1623 				 * made on 32 bit kernel.
1624 				 * Treat each pointer as pointer to
1625 				 * aiocb64_32
1626 				 */
1627 				if (head = aio_list_get((aio_result_t *)
1628 				    &(((aiocb64_32_t *)cbp)->aio_resultp)))
1629 					break;
1630 			}
1631 		}
1632 #ifdef	_SYSCALL32_IMPL
1633 		else {
1634 			if (run_mode == AIO_LARGEFILE) {
1635 				if ((cbp64 = (aiocb64_32_t *)
1636 				    (uintptr_t)*ucbp32++) == NULL)
1637 					continue;
1638 				if (head = aio_list_get((aio_result_t *)
1639 				    &cbp64->aio_resultp))
1640 					break;
1641 			} else if (run_mode == AIO_32) {
1642 				if ((cbp32 = (aiocb32_t *)
1643 				    (uintptr_t)*ucbp32++) == NULL)
1644 					continue;
1645 				if (head = aio_list_get((aio_result_t *)
1646 				    &cbp32->aio_resultp))
1647 					break;
1648 			}
1649 		}
1650 #endif	/* _SYSCALL32_IMPL */
1651 	}
1652 
1653 	if (head == NULL) {
1654 		error = EINVAL;
1655 		goto done;
1656 	}
1657 
1658 	mutex_enter(&aiop->aio_mutex);
1659 	while (head->lio_refcnt > 0) {
1660 		if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1661 			mutex_exit(&aiop->aio_mutex);
1662 			error = EINTR;
1663 			goto done;
1664 		}
1665 	}
1666 	mutex_exit(&aiop->aio_mutex);
1667 	alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
1668 done:
1669 	kmem_free(cbplist, ssize);
1670 	return (error);
1671 }
1672 
1673 aio_lio_t *
1674 aio_list_get(aio_result_t *resultp)
1675 {
1676 	aio_lio_t	*head = NULL;
1677 	aio_t		*aiop;
1678 	aio_req_t 	**bucket;
1679 	aio_req_t 	*reqp;
1680 	long		index;
1681 
1682 	aiop = curproc->p_aio;
1683 	if (aiop == NULL)
1684 		return (NULL);
1685 
1686 	if (resultp) {
1687 		index = AIO_HASH(resultp);
1688 		bucket = &aiop->aio_hash[index];
1689 		for (reqp = *bucket; reqp != NULL;
1690 		    reqp = reqp->aio_hash_next) {
1691 			if (reqp->aio_req_resultp == resultp) {
1692 				head = reqp->aio_req_lio;
1693 				return (head);
1694 			}
1695 		}
1696 	}
1697 	return (NULL);
1698 }
1699 
1700 
1701 static void
1702 lio_set_uerror(void *resultp, int error)
1703 {
1704 	/*
1705 	 * the resultp field is a pointer to where the
1706 	 * error should be written out to the user's
1707 	 * aiocb.
1708 	 *
1709 	 */
1710 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1711 		(void) sulword(&((aio_result_t *)resultp)->aio_return,
1712 		    (ssize_t)-1);
1713 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1714 	}
1715 #ifdef	_SYSCALL32_IMPL
1716 	else {
1717 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1718 		    (uint_t)-1);
1719 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1720 	}
1721 #endif  /* _SYSCALL32_IMPL */
1722 }
1723 
1724 /*
1725  * do cleanup completion for all requests in list. memory for
1726  * each request is also freed.
1727  */
1728 static void
1729 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
1730 {
1731 	int i;
1732 	aio_req_t *reqp;
1733 	aio_result_t *resultp;
1734 	aiocb64_32_t *aiocb_64;
1735 
1736 	for (i = 0; i < nent; i++) {
1737 		if (get_udatamodel() == DATAMODEL_NATIVE) {
1738 			if (cbp[i] == NULL)
1739 				continue;
1740 			if (run_mode == AIO_LARGEFILE) {
1741 				aiocb_64 = (aiocb64_32_t *)cbp[i];
1742 				resultp = (aio_result_t *)
1743 				    &aiocb_64->aio_resultp;
1744 			} else
1745 				resultp = &cbp[i]->aio_resultp;
1746 		}
1747 #ifdef	_SYSCALL32_IMPL
1748 		else {
1749 			aiocb32_t *aiocb_32;
1750 			caddr32_t *cbp32;
1751 
1752 			cbp32 = (caddr32_t *)cbp;
1753 			if (cbp32[i] == NULL)
1754 				continue;
1755 			if (run_mode == AIO_32) {
1756 				aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
1757 				resultp = (aio_result_t *)&aiocb_32->
1758 				    aio_resultp;
1759 			} else if (run_mode == AIO_LARGEFILE) {
1760 				aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
1761 				resultp = (aio_result_t *)&aiocb_64->
1762 				    aio_resultp;
1763 			}
1764 		}
1765 #endif  /* _SYSCALL32_IMPL */
1766 		/*
1767 		 * we need to get the aio_cleanupq_mutex since we call
1768 		 * aio_req_done().
1769 		 */
1770 		mutex_enter(&aiop->aio_cleanupq_mutex);
1771 		mutex_enter(&aiop->aio_mutex);
1772 		reqp = aio_req_done(resultp);
1773 		mutex_exit(&aiop->aio_mutex);
1774 		mutex_exit(&aiop->aio_cleanupq_mutex);
1775 		if (reqp != NULL) {
1776 			aphysio_unlock(reqp);
1777 			aio_copyout_result(reqp);
1778 			mutex_enter(&aiop->aio_mutex);
1779 			aio_req_free(aiop, reqp);
1780 			mutex_exit(&aiop->aio_mutex);
1781 		}
1782 	}
1783 }
1784 
1785 /*
1786  * Write out the results for an aio request that is done.
1787  */
1788 static int
1789 aioerror(void *cb, int run_mode)
1790 {
1791 	aio_result_t *resultp;
1792 	aio_t *aiop;
1793 	aio_req_t *reqp;
1794 	int retval;
1795 
1796 	aiop = curproc->p_aio;
1797 	if (aiop == NULL || cb == NULL)
1798 		return (EINVAL);
1799 
1800 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1801 		if (run_mode == AIO_LARGEFILE)
1802 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1803 			    aio_resultp;
1804 		else
1805 			resultp = &((aiocb_t *)cb)->aio_resultp;
1806 	}
1807 #ifdef	_SYSCALL32_IMPL
1808 	else {
1809 		if (run_mode == AIO_LARGEFILE)
1810 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1811 			    aio_resultp;
1812 		else if (run_mode == AIO_32)
1813 			resultp = (aio_result_t *)&((aiocb32_t *)cb)->
1814 			    aio_resultp;
1815 	}
1816 #endif  /* _SYSCALL32_IMPL */
1817 	/*
1818 	 * we need to get the aio_cleanupq_mutex since we call
1819 	 * aio_req_find().
1820 	 */
1821 	mutex_enter(&aiop->aio_cleanupq_mutex);
1822 	mutex_enter(&aiop->aio_mutex);
1823 	retval = aio_req_find(resultp, &reqp);
1824 	mutex_exit(&aiop->aio_mutex);
1825 	mutex_exit(&aiop->aio_cleanupq_mutex);
1826 	if (retval == 0) {
1827 		aphysio_unlock(reqp);
1828 		aio_copyout_result(reqp);
1829 		mutex_enter(&aiop->aio_mutex);
1830 		aio_req_free(aiop, reqp);
1831 		mutex_exit(&aiop->aio_mutex);
1832 		return (0);
1833 	} else if (retval == 1)
1834 		return (EINPROGRESS);
1835 	else if (retval == 2)
1836 		return (EINVAL);
1837 	return (0);
1838 }
1839 
1840 /*
1841  * 	aio_cancel - if no requests outstanding,
1842  *			return AIO_ALLDONE
1843  *			else
1844  *			return AIO_NOTCANCELED
1845  */
1846 static int
1847 aio_cancel(
1848 	int	fildes,
1849 	void 	*cb,
1850 	long	*rval,
1851 	int	run_mode)
1852 {
1853 	aio_t *aiop;
1854 	void *resultp;
1855 	int index;
1856 	aio_req_t **bucket;
1857 	aio_req_t *ent;
1858 
1859 
1860 	/*
1861 	 * Verify valid file descriptor
1862 	 */
1863 	if ((getf(fildes)) == NULL) {
1864 		return (EBADF);
1865 	}
1866 	releasef(fildes);
1867 
1868 	aiop = curproc->p_aio;
1869 	if (aiop == NULL)
1870 		return (EINVAL);
1871 
1872 	if (aiop->aio_outstanding == 0) {
1873 		*rval = AIO_ALLDONE;
1874 		return (0);
1875 	}
1876 
1877 	mutex_enter(&aiop->aio_mutex);
1878 	if (cb != NULL) {
1879 		if (get_udatamodel() == DATAMODEL_NATIVE) {
1880 			if (run_mode == AIO_LARGEFILE)
1881 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1882 				    ->aio_resultp;
1883 			else
1884 				resultp = &((aiocb_t *)cb)->aio_resultp;
1885 		}
1886 #ifdef	_SYSCALL32_IMPL
1887 		else {
1888 			if (run_mode == AIO_LARGEFILE)
1889 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1890 				    ->aio_resultp;
1891 			else if (run_mode == AIO_32)
1892 				resultp = (aio_result_t *)&((aiocb32_t *)cb)
1893 				    ->aio_resultp;
1894 		}
1895 #endif  /* _SYSCALL32_IMPL */
1896 		index = AIO_HASH(resultp);
1897 		bucket = &aiop->aio_hash[index];
1898 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1899 			if (ent->aio_req_resultp == resultp) {
1900 				if ((ent->aio_req_flags & AIO_PENDING) == 0) {
1901 					mutex_exit(&aiop->aio_mutex);
1902 					*rval = AIO_ALLDONE;
1903 					return (0);
1904 				}
1905 				mutex_exit(&aiop->aio_mutex);
1906 				*rval = AIO_NOTCANCELED;
1907 				return (0);
1908 			}
1909 		}
1910 		mutex_exit(&aiop->aio_mutex);
1911 		*rval = AIO_ALLDONE;
1912 		return (0);
1913 	}
1914 
1915 	for (index = 0; index < AIO_HASHSZ; index++) {
1916 		bucket = &aiop->aio_hash[index];
1917 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1918 			if (ent->aio_req_fd == fildes) {
1919 				if ((ent->aio_req_flags & AIO_PENDING) != 0) {
1920 					mutex_exit(&aiop->aio_mutex);
1921 					*rval = AIO_NOTCANCELED;
1922 					return (0);
1923 				}
1924 			}
1925 		}
1926 	}
1927 	mutex_exit(&aiop->aio_mutex);
1928 	*rval = AIO_ALLDONE;
1929 	return (0);
1930 }
1931 
1932 /*
1933  * solaris version of asynchronous read and write
1934  */
1935 static int
1936 arw(
1937 	int	opcode,
1938 	int	fdes,
1939 	char	*bufp,
1940 	int	bufsize,
1941 	offset_t	offset,
1942 	aio_result_t	*resultp,
1943 	int		mode)
1944 {
1945 	file_t		*fp;
1946 	int		error;
1947 	struct vnode	*vp;
1948 	aio_req_t	*reqp;
1949 	aio_t		*aiop;
1950 	int		(*aio_func)();
1951 #ifdef _LP64
1952 	aiocb_t		aiocb;
1953 #else
1954 	aiocb64_32_t	aiocb64;
1955 #endif
1956 
1957 	aiop = curproc->p_aio;
1958 	if (aiop == NULL)
1959 		return (EINVAL);
1960 
1961 	if ((fp = getf(fdes)) == NULL) {
1962 		return (EBADF);
1963 	}
1964 
1965 	/*
1966 	 * check the permission of the partition
1967 	 */
1968 	if ((fp->f_flag & mode) == 0) {
1969 		releasef(fdes);
1970 		return (EBADF);
1971 	}
1972 
1973 	vp = fp->f_vnode;
1974 	aio_func = check_vp(vp, mode);
1975 	if (aio_func == NULL) {
1976 		releasef(fdes);
1977 		return (EBADFD);
1978 	}
1979 #ifdef _LP64
1980 	aiocb.aio_fildes = fdes;
1981 	aiocb.aio_buf = bufp;
1982 	aiocb.aio_nbytes = bufsize;
1983 	aiocb.aio_offset = offset;
1984 	aiocb.aio_sigevent.sigev_notify = 0;
1985 	error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1);
1986 #else
1987 	aiocb64.aio_fildes = fdes;
1988 	aiocb64.aio_buf = (caddr32_t)bufp;
1989 	aiocb64.aio_nbytes = bufsize;
1990 	aiocb64.aio_offset = offset;
1991 	aiocb64.aio_sigevent.sigev_notify = 0;
1992 	error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1);
1993 #endif
1994 	if (error) {
1995 		releasef(fdes);
1996 		return (error);
1997 	}
1998 
1999 	/*
2000 	 * enable polling on this request if the opcode has
2001 	 * the AIO poll bit set
2002 	 */
2003 	if (opcode & AIO_POLL_BIT)
2004 		reqp->aio_req_flags |= AIO_POLL;
2005 
2006 	if (bufsize == 0) {
2007 		clear_active_fd(fdes);
2008 		aio_zerolen(reqp);
2009 		return (0);
2010 	}
2011 	/*
2012 	 * send the request to driver.
2013 	 */
2014 	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2015 	/*
2016 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2017 	 * is released by the aio_cleanup_thread() when the IO has
2018 	 * completed.
2019 	 */
2020 	if (error) {
2021 		releasef(fdes);
2022 		mutex_enter(&aiop->aio_mutex);
2023 		aio_req_free(aiop, reqp);
2024 		aiop->aio_pending--;
2025 		if (aiop->aio_flags & AIO_REQ_BLOCK)
2026 			cv_signal(&aiop->aio_cleanupcv);
2027 		mutex_exit(&aiop->aio_mutex);
2028 		return (error);
2029 	}
2030 	clear_active_fd(fdes);
2031 	return (0);
2032 }
2033 
2034 /*
2035  * posix version of asynchronous read and write
2036  */
2037 static int
2038 aiorw(
2039 	int		opcode,
2040 	void		*aiocb_arg,
2041 	int		mode,
2042 	int		run_mode)
2043 {
2044 #ifdef _SYSCALL32_IMPL
2045 	aiocb32_t	aiocb32;
2046 	struct	sigevent32 *sigev32;
2047 	port_notify32_t	pntfy32;
2048 #endif
2049 	aiocb64_32_t	aiocb64;
2050 	aiocb_t		aiocb;
2051 	file_t		*fp;
2052 	int		error, fd;
2053 	size_t		bufsize;
2054 	struct vnode	*vp;
2055 	aio_req_t	*reqp;
2056 	aio_t		*aiop;
2057 	int		(*aio_func)();
2058 	aio_result_t	*resultp;
2059 	struct	sigevent *sigev;
2060 	model_t		model;
2061 	int		aio_use_port = 0;
2062 	port_notify_t	pntfy;
2063 
2064 	model = get_udatamodel();
2065 	aiop = curproc->p_aio;
2066 	if (aiop == NULL)
2067 		return (EINVAL);
2068 
2069 	if (model == DATAMODEL_NATIVE) {
2070 		if (run_mode != AIO_LARGEFILE) {
2071 			if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
2072 				return (EFAULT);
2073 			bufsize = aiocb.aio_nbytes;
2074 			resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
2075 			if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
2076 				return (EBADF);
2077 			}
2078 			sigev = &aiocb.aio_sigevent;
2079 		} else {
2080 			/*
2081 			 * We come here only when we make largefile
2082 			 * call on 32 bit kernel using 32 bit library.
2083 			 */
2084 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2085 				return (EFAULT);
2086 			bufsize = aiocb64.aio_nbytes;
2087 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2088 			    ->aio_resultp);
2089 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2090 				return (EBADF);
2091 			sigev = (struct sigevent *)&aiocb64.aio_sigevent;
2092 		}
2093 
2094 		if (sigev->sigev_notify == SIGEV_PORT) {
2095 			if (copyin((void *)sigev->sigev_value.sival_ptr,
2096 			    &pntfy, sizeof (port_notify_t))) {
2097 				releasef(fd);
2098 				return (EFAULT);
2099 			}
2100 			aio_use_port = 1;
2101 		} else if (sigev->sigev_notify == SIGEV_THREAD) {
2102 			pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo;
2103 			pntfy.portnfy_user =
2104 			    aiocb.aio_sigevent.sigev_value.sival_ptr;
2105 			aio_use_port = 1;
2106 		}
2107 	}
2108 #ifdef	_SYSCALL32_IMPL
2109 	else {
2110 		if (run_mode == AIO_32) {
2111 			/* 32 bit system call is being made on 64 bit kernel */
2112 			if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
2113 				return (EFAULT);
2114 
2115 			bufsize = aiocb32.aio_nbytes;
2116 			aiocb_32ton(&aiocb32, &aiocb);
2117 			resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
2118 			    aio_resultp);
2119 			if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
2120 				return (EBADF);
2121 			}
2122 			sigev32 = &aiocb32.aio_sigevent;
2123 		} else if (run_mode == AIO_LARGEFILE) {
2124 			/*
2125 			 * We come here only when we make largefile
2126 			 * call on 64 bit kernel using 32 bit library.
2127 			 */
2128 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2129 				return (EFAULT);
2130 			bufsize = aiocb64.aio_nbytes;
2131 			aiocb_LFton(&aiocb64, &aiocb);
2132 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2133 			    ->aio_resultp);
2134 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2135 				return (EBADF);
2136 			sigev32 = &aiocb64.aio_sigevent;
2137 		}
2138 
2139 		if (sigev32->sigev_notify == SIGEV_PORT) {
2140 			if (copyin(
2141 			    (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
2142 			    &pntfy32, sizeof (port_notify32_t))) {
2143 				releasef(fd);
2144 				return (EFAULT);
2145 			}
2146 			pntfy.portnfy_port = pntfy32.portnfy_port;
2147 			pntfy.portnfy_user = (void *)(uintptr_t)
2148 			    pntfy32.portnfy_user;
2149 			aio_use_port = 1;
2150 		} else if (sigev32->sigev_notify == SIGEV_THREAD) {
2151 			pntfy.portnfy_port = sigev32->sigev_signo;
2152 			pntfy.portnfy_user = (void *)(uintptr_t)
2153 			    sigev32->sigev_value.sival_ptr;
2154 			aio_use_port = 1;
2155 		}
2156 	}
2157 #endif  /* _SYSCALL32_IMPL */
2158 
2159 	/*
2160 	 * check the permission of the partition
2161 	 */
2162 
2163 	if ((fp->f_flag & mode) == 0) {
2164 		releasef(fd);
2165 		return (EBADF);
2166 	}
2167 
2168 	vp = fp->f_vnode;
2169 	aio_func = check_vp(vp, mode);
2170 	if (aio_func == NULL) {
2171 		releasef(fd);
2172 		return (EBADFD);
2173 	}
2174 	if (run_mode == AIO_LARGEFILE)
2175 		error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0);
2176 	else
2177 		error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0);
2178 
2179 	if (error) {
2180 		releasef(fd);
2181 		return (error);
2182 	}
2183 	/*
2184 	 * enable polling on this request if the opcode has
2185 	 * the AIO poll bit set
2186 	 */
2187 	if (opcode & AIO_POLL_BIT)
2188 		reqp->aio_req_flags |= AIO_POLL;
2189 
2190 	if (model == DATAMODEL_NATIVE)
2191 		reqp->aio_req_iocb.iocb = aiocb_arg;
2192 #ifdef  _SYSCALL32_IMPL
2193 	else
2194 		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
2195 #endif
2196 
2197 	if (aio_use_port) {
2198 		int event = (run_mode == AIO_LARGEFILE)?
2199 		    ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
2200 		    ((mode == FREAD)? AIOAREAD : AIOAWRITE);
2201 		error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
2202 	}
2203 
2204 	/*
2205 	 * send the request to driver.
2206 	 */
2207 	if (error == 0) {
2208 		if (bufsize == 0) {
2209 			clear_active_fd(fd);
2210 			aio_zerolen(reqp);
2211 			return (0);
2212 		}
2213 		error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2214 	}
2215 
2216 	/*
2217 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2218 	 * is released by the aio_cleanup_thread() when the IO has
2219 	 * completed.
2220 	 */
2221 	if (error) {
2222 		releasef(fd);
2223 		mutex_enter(&aiop->aio_mutex);
2224 		if (aio_use_port)
2225 			aio_deq(&aiop->aio_portpending, reqp);
2226 		aio_req_free(aiop, reqp);
2227 		aiop->aio_pending--;
2228 		if (aiop->aio_flags & AIO_REQ_BLOCK)
2229 			cv_signal(&aiop->aio_cleanupcv);
2230 		mutex_exit(&aiop->aio_mutex);
2231 		return (error);
2232 	}
2233 	clear_active_fd(fd);
2234 	return (0);
2235 }
2236 
2237 
2238 /*
2239  * set error for a list IO entry that failed.
2240  */
2241 static void
2242 lio_set_error(aio_req_t *reqp, int portused)
2243 {
2244 	aio_t *aiop = curproc->p_aio;
2245 
2246 	if (aiop == NULL)
2247 		return;
2248 
2249 	mutex_enter(&aiop->aio_mutex);
2250 	if (portused)
2251 		aio_deq(&aiop->aio_portpending, reqp);
2252 	aiop->aio_pending--;
2253 	/* request failed, AIO_PHYSIODONE set to aviod physio cleanup. */
2254 	reqp->aio_req_flags |= AIO_PHYSIODONE;
2255 	/*
2256 	 * Need to free the request now as its never
2257 	 * going to get on the done queue
2258 	 *
2259 	 * Note: aio_outstanding is decremented in
2260 	 *	 aio_req_free()
2261 	 */
2262 	aio_req_free(aiop, reqp);
2263 	if (aiop->aio_flags & AIO_REQ_BLOCK)
2264 		cv_signal(&aiop->aio_cleanupcv);
2265 	mutex_exit(&aiop->aio_mutex);
2266 }
2267 
2268 /*
2269  * check if a specified request is done, and remove it from
2270  * the done queue. otherwise remove anybody from the done queue
2271  * if NULL is specified.
2272  */
2273 static aio_req_t *
2274 aio_req_done(void *resultp)
2275 {
2276 	aio_req_t **bucket;
2277 	aio_req_t *ent;
2278 	aio_t *aiop = curproc->p_aio;
2279 	long index;
2280 
2281 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2282 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2283 
2284 	if (resultp) {
2285 		index = AIO_HASH(resultp);
2286 		bucket = &aiop->aio_hash[index];
2287 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2288 			if (ent->aio_req_resultp == (aio_result_t *)resultp) {
2289 				if (ent->aio_req_flags & AIO_DONEQ) {
2290 					return (aio_req_remove(ent));
2291 				}
2292 				return (NULL);
2293 			}
2294 		}
2295 		/* no match, resultp is invalid */
2296 		return (NULL);
2297 	}
2298 	return (aio_req_remove(NULL));
2299 }
2300 
2301 /*
2302  * determine if a user-level resultp pointer is associated with an
2303  * active IO request. Zero is returned when the request is done,
2304  * and the request is removed from the done queue. Only when the
2305  * return value is zero, is the "reqp" pointer valid. One is returned
2306  * when the request is inprogress. Two is returned when the request
2307  * is invalid.
2308  */
2309 static int
2310 aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
2311 {
2312 	aio_req_t **bucket;
2313 	aio_req_t *ent;
2314 	aio_t *aiop = curproc->p_aio;
2315 	long index;
2316 
2317 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2318 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2319 
2320 	index = AIO_HASH(resultp);
2321 	bucket = &aiop->aio_hash[index];
2322 	for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2323 		if (ent->aio_req_resultp == resultp) {
2324 			if (ent->aio_req_flags & AIO_DONEQ) {
2325 				*reqp = aio_req_remove(ent);
2326 				return (0);
2327 			}
2328 			return (1);
2329 		}
2330 	}
2331 	/* no match, resultp is invalid */
2332 	return (2);
2333 }
2334 
2335 /*
2336  * remove a request from the done queue.
2337  */
2338 static aio_req_t *
2339 aio_req_remove(aio_req_t *reqp)
2340 {
2341 	aio_t *aiop = curproc->p_aio;
2342 
2343 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2344 
2345 	if (reqp != NULL) {
2346 		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2347 		if (reqp->aio_req_next == reqp) {
2348 			/* only one request on queue */
2349 			if (reqp ==  aiop->aio_doneq) {
2350 				aiop->aio_doneq = NULL;
2351 			} else {
2352 				ASSERT(reqp == aiop->aio_cleanupq);
2353 				aiop->aio_cleanupq = NULL;
2354 			}
2355 		} else {
2356 			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2357 			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2358 			/*
2359 			 * The request can be either on the aio_doneq or the
2360 			 * aio_cleanupq
2361 			 */
2362 			if (reqp == aiop->aio_doneq)
2363 				aiop->aio_doneq = reqp->aio_req_next;
2364 
2365 			if (reqp == aiop->aio_cleanupq)
2366 				aiop->aio_cleanupq = reqp->aio_req_next;
2367 		}
2368 		reqp->aio_req_flags &= ~AIO_DONEQ;
2369 		reqp->aio_req_next = NULL;
2370 		reqp->aio_req_prev = NULL;
2371 	} else if ((reqp = aiop->aio_doneq) != NULL) {
2372 		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2373 		if (reqp == reqp->aio_req_next) {
2374 			/* only one request on queue */
2375 			aiop->aio_doneq = NULL;
2376 		} else {
2377 			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2378 			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2379 			aiop->aio_doneq = reqp->aio_req_next;
2380 		}
2381 		reqp->aio_req_flags &= ~AIO_DONEQ;
2382 		reqp->aio_req_next = NULL;
2383 		reqp->aio_req_prev = NULL;
2384 	}
2385 	if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN))
2386 		cv_broadcast(&aiop->aio_waitcv);
2387 	return (reqp);
2388 }
2389 
2390 static int
2391 aio_req_setup(
2392 	aio_req_t	**reqpp,
2393 	aio_t 		*aiop,
2394 	aiocb_t 	*arg,
2395 	aio_result_t 	*resultp,
2396 	vnode_t		*vp,
2397 	int		old_solaris_req)
2398 {
2399 	sigqueue_t	*sqp = NULL;
2400 	aio_req_t 	*reqp;
2401 	struct uio 	*uio;
2402 	struct sigevent *sigev;
2403 	int		error;
2404 
2405 	sigev = &arg->aio_sigevent;
2406 	if (sigev->sigev_notify == SIGEV_SIGNAL &&
2407 	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
2408 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2409 		if (sqp == NULL)
2410 			return (EAGAIN);
2411 		sqp->sq_func = NULL;
2412 		sqp->sq_next = NULL;
2413 		sqp->sq_info.si_code = SI_ASYNCIO;
2414 		sqp->sq_info.si_pid = curproc->p_pid;
2415 		sqp->sq_info.si_ctid = PRCTID(curproc);
2416 		sqp->sq_info.si_zoneid = getzoneid();
2417 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2418 		sqp->sq_info.si_signo = sigev->sigev_signo;
2419 		sqp->sq_info.si_value = sigev->sigev_value;
2420 	}
2421 
2422 	mutex_enter(&aiop->aio_mutex);
2423 
2424 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
2425 		mutex_exit(&aiop->aio_mutex);
2426 		if (sqp)
2427 			kmem_free(sqp, sizeof (sigqueue_t));
2428 		return (EIO);
2429 	}
2430 	/*
2431 	 * get an aio_reqp from the free list or allocate one
2432 	 * from dynamic memory.
2433 	 */
2434 	if (error = aio_req_alloc(&reqp, resultp)) {
2435 		mutex_exit(&aiop->aio_mutex);
2436 		if (sqp)
2437 			kmem_free(sqp, sizeof (sigqueue_t));
2438 		return (error);
2439 	}
2440 	aiop->aio_pending++;
2441 	aiop->aio_outstanding++;
2442 	reqp->aio_req_flags = AIO_PENDING;
2443 	if (old_solaris_req) {
2444 		/* this is an old solaris aio request */
2445 		reqp->aio_req_flags |= AIO_SOLARIS;
2446 		aiop->aio_flags |= AIO_SOLARIS_REQ;
2447 	}
2448 	if (sigev->sigev_notify == SIGEV_THREAD ||
2449 	    sigev->sigev_notify == SIGEV_PORT)
2450 		aio_enq(&aiop->aio_portpending, reqp, 0);
2451 	mutex_exit(&aiop->aio_mutex);
2452 	/*
2453 	 * initialize aio request.
2454 	 */
2455 	reqp->aio_req_fd = arg->aio_fildes;
2456 	reqp->aio_req_sigqp = sqp;
2457 	reqp->aio_req_iocb.iocb = NULL;
2458 	reqp->aio_req_lio = NULL;
2459 	reqp->aio_req_buf.b_file = vp;
2460 	uio = reqp->aio_req.aio_uio;
2461 	uio->uio_iovcnt = 1;
2462 	uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
2463 	uio->uio_iov->iov_len = arg->aio_nbytes;
2464 	uio->uio_loffset = arg->aio_offset;
2465 	*reqpp = reqp;
2466 	return (0);
2467 }
2468 
2469 /*
2470  * Allocate p_aio struct.
2471  */
2472 static aio_t *
2473 aio_aiop_alloc(void)
2474 {
2475 	aio_t	*aiop;
2476 
2477 	ASSERT(MUTEX_HELD(&curproc->p_lock));
2478 
2479 	aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
2480 	if (aiop) {
2481 		mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
2482 		mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT,
2483 		    NULL);
2484 		mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
2485 	}
2486 	return (aiop);
2487 }
2488 
2489 /*
2490  * Allocate an aio_req struct.
2491  */
2492 static int
2493 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
2494 {
2495 	aio_req_t *reqp;
2496 	aio_t *aiop = curproc->p_aio;
2497 
2498 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2499 
2500 	if ((reqp = aiop->aio_free) != NULL) {
2501 		aiop->aio_free = reqp->aio_req_next;
2502 		bzero(reqp, sizeof (*reqp));
2503 	} else {
2504 		/*
2505 		 * Check whether memory is getting tight.
2506 		 * This is a temporary mechanism to avoid memory
2507 		 * exhaustion by a single process until we come up
2508 		 * with a per process solution such as setrlimit().
2509 		 */
2510 		if (freemem < desfree)
2511 			return (EAGAIN);
2512 		reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
2513 		if (reqp == NULL)
2514 			return (EAGAIN);
2515 	}
2516 	reqp->aio_req.aio_uio = &reqp->aio_req_uio;
2517 	reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov;
2518 	reqp->aio_req.aio_private = reqp;
2519 	reqp->aio_req_buf.b_offset = -1;
2520 	reqp->aio_req_resultp = resultp;
2521 	if (aio_hash_insert(reqp, aiop)) {
2522 		reqp->aio_req_next = aiop->aio_free;
2523 		aiop->aio_free = reqp;
2524 		return (EBUSY);
2525 	}
2526 	*nreqp = reqp;
2527 	return (0);
2528 }
2529 
2530 /*
2531  * Allocate an aio_lio_t struct.
2532  */
2533 static int
2534 aio_lio_alloc(aio_lio_t **head)
2535 {
2536 	aio_lio_t *liop;
2537 	aio_t *aiop = curproc->p_aio;
2538 
2539 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2540 
2541 	if ((liop = aiop->aio_lio_free) != NULL) {
2542 		aiop->aio_lio_free = liop->lio_next;
2543 	} else {
2544 		/*
2545 		 * Check whether memory is getting tight.
2546 		 * This is a temporary mechanism to avoid memory
2547 		 * exhaustion by a single process until we come up
2548 		 * with a per process solution such as setrlimit().
2549 		 */
2550 		if (freemem < desfree)
2551 			return (EAGAIN);
2552 
2553 		liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
2554 		if (liop == NULL)
2555 			return (EAGAIN);
2556 	}
2557 	*head = liop;
2558 	return (0);
2559 }
2560 
2561 /*
2562  * this is a special per-process thread that is only activated if
2563  * the process is unmapping a segment with outstanding aio. normally,
2564  * the process will have completed the aio before unmapping the
2565  * segment. If the process does unmap a segment with outstanding aio,
2566  * this special thread will guarentee that the locked pages due to
2567  * aphysio() are released, thereby permitting the segment to be
2568  * unmapped. In addition to this, the cleanup thread is woken up
2569  * during DR operations to release the locked pages.
2570  */
2571 
2572 static int
2573 aio_cleanup_thread(aio_t *aiop)
2574 {
2575 	proc_t *p = curproc;
2576 	struct as *as = p->p_as;
2577 	int poked = 0;
2578 	kcondvar_t *cvp;
2579 	int exit_flag = 0;
2580 	int rqclnup = 0;
2581 
2582 	sigfillset(&curthread->t_hold);
2583 	sigdiffset(&curthread->t_hold, &cantmask);
2584 	for (;;) {
2585 		/*
2586 		 * if a segment is being unmapped, and the current
2587 		 * process's done queue is not empty, then every request
2588 		 * on the doneq with locked resources should be forced
2589 		 * to release their locks. By moving the doneq request
2590 		 * to the cleanupq, aio_cleanup() will process the cleanupq,
2591 		 * and place requests back onto the doneq. All requests
2592 		 * processed by aio_cleanup() will have their physical
2593 		 * resources unlocked.
2594 		 */
2595 		mutex_enter(&aiop->aio_mutex);
2596 		if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
2597 			aiop->aio_flags |= AIO_CLEANUP;
2598 			mutex_enter(&as->a_contents);
2599 			if (aiop->aio_rqclnup) {
2600 				aiop->aio_rqclnup = 0;
2601 				rqclnup = 1;
2602 			}
2603 			mutex_exit(&as->a_contents);
2604 			if (aiop->aio_doneq) {
2605 				aio_req_t *doneqhead = aiop->aio_doneq;
2606 				aiop->aio_doneq = NULL;
2607 				aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
2608 			}
2609 		}
2610 		mutex_exit(&aiop->aio_mutex);
2611 		aio_cleanup(AIO_CLEANUP_THREAD);
2612 		/*
2613 		 * thread should block on the cleanupcv while
2614 		 * AIO_CLEANUP is set.
2615 		 */
2616 		cvp = &aiop->aio_cleanupcv;
2617 		mutex_enter(&aiop->aio_mutex);
2618 
2619 		if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
2620 		    aiop->aio_notifyq != NULL ||
2621 		    aiop->aio_portcleanupq != NULL) {
2622 			mutex_exit(&aiop->aio_mutex);
2623 			continue;
2624 		}
2625 		mutex_enter(&as->a_contents);
2626 
2627 		/*
2628 		 * AIO_CLEANUP determines when the cleanup thread
2629 		 * should be active. This flag is set when
2630 		 * the cleanup thread is awakened by as_unmap() or
2631 		 * due to DR operations.
2632 		 * The flag is cleared when the blocking as_unmap()
2633 		 * that originally awakened us is allowed to
2634 		 * complete. as_unmap() blocks when trying to
2635 		 * unmap a segment that has SOFTLOCKed pages. when
2636 		 * the segment's pages are all SOFTUNLOCKed,
2637 		 * as->a_flags & AS_UNMAPWAIT should be zero.
2638 		 *
2639 		 * In case of cleanup request by DR, the flag is cleared
2640 		 * once all the pending aio requests have been processed.
2641 		 *
2642 		 * The flag shouldn't be cleared right away if the
2643 		 * cleanup thread was interrupted because the process
2644 		 * is doing forkall(). This happens when cv_wait_sig()
2645 		 * returns zero, because it was awakened by a pokelwps().
2646 		 * If the process is not exiting, it must be doing forkall().
2647 		 */
2648 		if ((poked == 0) &&
2649 		    ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
2650 		    (aiop->aio_pending == 0))) {
2651 			aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
2652 			cvp = &as->a_cv;
2653 			rqclnup = 0;
2654 		}
2655 		mutex_exit(&aiop->aio_mutex);
2656 		if (poked) {
2657 			/*
2658 			 * If the process is exiting/killed, don't return
2659 			 * immediately without waiting for pending I/O's
2660 			 * and releasing the page locks.
2661 			 */
2662 			if (p->p_flag & (SEXITLWPS|SKILLED)) {
2663 				/*
2664 				 * If exit_flag is set, then it is
2665 				 * safe to exit because we have released
2666 				 * page locks of completed I/O's.
2667 				 */
2668 				if (exit_flag)
2669 					break;
2670 
2671 				mutex_exit(&as->a_contents);
2672 
2673 				/*
2674 				 * Wait for all the pending aio to complete.
2675 				 */
2676 				mutex_enter(&aiop->aio_mutex);
2677 				aiop->aio_flags |= AIO_REQ_BLOCK;
2678 				while (aiop->aio_pending != 0)
2679 					cv_wait(&aiop->aio_cleanupcv,
2680 					    &aiop->aio_mutex);
2681 				mutex_exit(&aiop->aio_mutex);
2682 				exit_flag = 1;
2683 				continue;
2684 			} else if (p->p_flag &
2685 			    (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
2686 				/*
2687 				 * hold LWP until it
2688 				 * is continued.
2689 				 */
2690 				mutex_exit(&as->a_contents);
2691 				mutex_enter(&p->p_lock);
2692 				stop(PR_SUSPENDED, SUSPEND_NORMAL);
2693 				mutex_exit(&p->p_lock);
2694 				poked = 0;
2695 				continue;
2696 			}
2697 		} else {
2698 			/*
2699 			 * When started this thread will sleep on as->a_cv.
2700 			 * as_unmap will awake this thread if the
2701 			 * segment has SOFTLOCKed pages (poked = 0).
2702 			 * 1. pokelwps() awakes this thread =>
2703 			 *    break the loop to check SEXITLWPS, SHOLDFORK, etc
2704 			 * 2. as_unmap awakes this thread =>
2705 			 *    to break the loop it is necessary that
2706 			 *    - AS_UNMAPWAIT is set (as_unmap is waiting for
2707 			 *	memory to be unlocked)
2708 			 *    - AIO_CLEANUP is not set
2709 			 *	(if AIO_CLEANUP is set we have to wait for
2710 			 *	pending requests. aio_done will send a signal
2711 			 *	for every request which completes to continue
2712 			 *	unmapping the corresponding address range)
2713 			 * 3. A cleanup request will wake this thread up, ex.
2714 			 *    by the DR operations. The aio_rqclnup flag will
2715 			 *    be set.
2716 			 */
2717 			while (poked == 0) {
2718 				/*
2719 				 * The clean up requests that came in
2720 				 * after we had just cleaned up, couldn't
2721 				 * be causing the unmap thread to block - as
2722 				 * unmap event happened first.
2723 				 * Let aio_done() wake us up if it sees a need.
2724 				 */
2725 				if (aiop->aio_rqclnup &&
2726 				    (aiop->aio_flags & AIO_CLEANUP) == 0)
2727 					break;
2728 				poked = !cv_wait_sig(cvp, &as->a_contents);
2729 				if (AS_ISUNMAPWAIT(as) == 0)
2730 					cv_signal(cvp);
2731 				if (aiop->aio_outstanding != 0)
2732 					break;
2733 			}
2734 		}
2735 		mutex_exit(&as->a_contents);
2736 	}
2737 exit:
2738 	mutex_exit(&as->a_contents);
2739 	ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
2740 	aston(curthread);	/* make thread do post_syscall */
2741 	return (0);
2742 }
2743 
2744 /*
2745  * save a reference to a user's outstanding aio in a hash list.
2746  */
2747 static int
2748 aio_hash_insert(
2749 	aio_req_t *aio_reqp,
2750 	aio_t *aiop)
2751 {
2752 	long index;
2753 	aio_result_t *resultp = aio_reqp->aio_req_resultp;
2754 	aio_req_t *current;
2755 	aio_req_t **nextp;
2756 
2757 	index = AIO_HASH(resultp);
2758 	nextp = &aiop->aio_hash[index];
2759 	while ((current = *nextp) != NULL) {
2760 		if (current->aio_req_resultp == resultp)
2761 			return (DUPLICATE);
2762 		nextp = &current->aio_hash_next;
2763 	}
2764 	*nextp = aio_reqp;
2765 	aio_reqp->aio_hash_next = NULL;
2766 	return (0);
2767 }
2768 
2769 static int
2770 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
2771     cred_t *)
2772 {
2773 	struct snode *sp;
2774 	dev_t		dev;
2775 	struct cb_ops  	*cb;
2776 	major_t		major;
2777 	int		(*aio_func)();
2778 
2779 	dev = vp->v_rdev;
2780 	major = getmajor(dev);
2781 
2782 	/*
2783 	 * return NULL for requests to files and STREAMs so
2784 	 * that libaio takes care of them.
2785 	 */
2786 	if (vp->v_type == VCHR) {
2787 		/* no stream device for kaio */
2788 		if (STREAMSTAB(major)) {
2789 			return (NULL);
2790 		}
2791 	} else {
2792 		return (NULL);
2793 	}
2794 
2795 	/*
2796 	 * Check old drivers which do not have async I/O entry points.
2797 	 */
2798 	if (devopsp[major]->devo_rev < 3)
2799 		return (NULL);
2800 
2801 	cb = devopsp[major]->devo_cb_ops;
2802 
2803 	if (cb->cb_rev < 1)
2804 		return (NULL);
2805 
2806 	/*
2807 	 * Check whether this device is a block device.
2808 	 * Kaio is not supported for devices like tty.
2809 	 */
2810 	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
2811 		return (NULL);
2812 
2813 	/*
2814 	 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
2815 	 * We cannot call the driver directly. Instead return the
2816 	 * PXFS functions.
2817 	 */
2818 
2819 	if (IS_PXFSVP(vp)) {
2820 		if (mode & FREAD)
2821 			return (clpxfs_aio_read);
2822 		else
2823 			return (clpxfs_aio_write);
2824 	}
2825 	if (mode & FREAD)
2826 		aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
2827 	else
2828 		aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;
2829 
2830 	/*
2831 	 * Do we need this ?
2832 	 * nodev returns ENXIO anyway.
2833 	 */
2834 	if (aio_func == nodev)
2835 		return (NULL);
2836 
2837 	sp = VTOS(vp);
2838 	smark(sp, SACC);
2839 	return (aio_func);
2840 }
2841 
2842 /*
2843  * Clustering: We want check_vp to return a function prototyped
2844  * correctly that will be common to both PXFS and regular case.
2845  * We define this intermediate function that will do the right
2846  * thing for driver cases.
2847  */
2848 
2849 static int
2850 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2851 {
2852 	dev_t dev;
2853 	struct cb_ops  	*cb;
2854 
2855 	ASSERT(vp->v_type == VCHR);
2856 	ASSERT(!IS_PXFSVP(vp));
2857 	dev = VTOS(vp)->s_dev;
2858 	ASSERT(STREAMSTAB(getmajor(dev)) == NULL);
2859 
2860 	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2861 
2862 	ASSERT(cb->cb_awrite != nodev);
2863 	return ((*cb->cb_awrite)(dev, aio, cred_p));
2864 }
2865 
2866 /*
2867  * Clustering: We want check_vp to return a function prototyped
2868  * correctly that will be common to both PXFS and regular case.
2869  * We define this intermediate function that will do the right
2870  * thing for driver cases.
2871  */
2872 
2873 static int
2874 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2875 {
2876 	dev_t dev;
2877 	struct cb_ops  	*cb;
2878 
2879 	ASSERT(vp->v_type == VCHR);
2880 	ASSERT(!IS_PXFSVP(vp));
2881 	dev = VTOS(vp)->s_dev;
2882 	ASSERT(!STREAMSTAB(getmajor(dev)));
2883 
2884 	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2885 
2886 	ASSERT(cb->cb_aread != nodev);
2887 	return ((*cb->cb_aread)(dev, aio, cred_p));
2888 }
2889 
2890 /*
2891  * This routine is called when a largefile call is made by a 32bit
2892  * process on a ILP32 or LP64 kernel. All 64bit processes are large
2893  * file by definition and will call alio() instead.
2894  */
2895 static int
2896 alioLF(
2897 	int		mode_arg,
2898 	void		*aiocb_arg,
2899 	int		nent,
2900 	void		*sigev)
2901 {
2902 	file_t		*fp;
2903 	file_t		*prev_fp = NULL;
2904 	int		prev_mode = -1;
2905 	struct vnode	*vp;
2906 	aio_lio_t	*head;
2907 	aio_req_t	*reqp;
2908 	aio_t		*aiop;
2909 	caddr_t		cbplist;
2910 	aiocb64_32_t	cb64;
2911 	aiocb64_32_t	*aiocb = &cb64;
2912 	aiocb64_32_t	*cbp;
2913 	caddr32_t	*ucbp;
2914 #ifdef _LP64
2915 	aiocb_t		aiocb_n;
2916 #endif
2917 	struct sigevent32	sigevk;
2918 	sigqueue_t	*sqp;
2919 	int		(*aio_func)();
2920 	int		mode;
2921 	int		error = 0;
2922 	int		aio_errors = 0;
2923 	int		i;
2924 	size_t		ssize;
2925 	int		deadhead = 0;
2926 	int		aio_notsupported = 0;
2927 	int		lio_head_port;
2928 	int		aio_port;
2929 	int		aio_thread;
2930 	port_kevent_t	*pkevtp = NULL;
2931 	int		portused = 0;
2932 	port_notify32_t	pnotify;
2933 	int		event;
2934 
2935 	aiop = curproc->p_aio;
2936 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
2937 		return (EINVAL);
2938 
2939 	ASSERT(get_udatamodel() == DATAMODEL_ILP32);
2940 
2941 	ssize = (sizeof (caddr32_t) * nent);
2942 	cbplist = kmem_alloc(ssize, KM_SLEEP);
2943 	ucbp = (caddr32_t *)cbplist;
2944 
2945 	if (copyin(aiocb_arg, cbplist, ssize) ||
2946 	    (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
2947 		kmem_free(cbplist, ssize);
2948 		return (EFAULT);
2949 	}
2950 
2951 	/* Event Ports  */
2952 	if (sigev &&
2953 	    (sigevk.sigev_notify == SIGEV_THREAD ||
2954 	    sigevk.sigev_notify == SIGEV_PORT)) {
2955 		if (sigevk.sigev_notify == SIGEV_THREAD) {
2956 			pnotify.portnfy_port = sigevk.sigev_signo;
2957 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
2958 		} else if (copyin(
2959 		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
2960 		    &pnotify, sizeof (pnotify))) {
2961 			kmem_free(cbplist, ssize);
2962 			return (EFAULT);
2963 		}
2964 		error = port_alloc_event(pnotify.portnfy_port,
2965 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
2966 		if (error) {
2967 			if (error == ENOMEM || error == EAGAIN)
2968 				error = EAGAIN;
2969 			else
2970 				error = EINVAL;
2971 			kmem_free(cbplist, ssize);
2972 			return (error);
2973 		}
2974 		lio_head_port = pnotify.portnfy_port;
2975 		portused = 1;
2976 	}
2977 
2978 	/*
2979 	 * a list head should be allocated if notification is
2980 	 * enabled for this list.
2981 	 */
2982 	head = NULL;
2983 
2984 	if (mode_arg == LIO_WAIT || sigev) {
2985 		mutex_enter(&aiop->aio_mutex);
2986 		error = aio_lio_alloc(&head);
2987 		mutex_exit(&aiop->aio_mutex);
2988 		if (error)
2989 			goto done;
2990 		deadhead = 1;
2991 		head->lio_nent = nent;
2992 		head->lio_refcnt = nent;
2993 		head->lio_port = -1;
2994 		head->lio_portkev = NULL;
2995 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
2996 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
2997 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2998 			if (sqp == NULL) {
2999 				error = EAGAIN;
3000 				goto done;
3001 			}
3002 			sqp->sq_func = NULL;
3003 			sqp->sq_next = NULL;
3004 			sqp->sq_info.si_code = SI_ASYNCIO;
3005 			sqp->sq_info.si_pid = curproc->p_pid;
3006 			sqp->sq_info.si_ctid = PRCTID(curproc);
3007 			sqp->sq_info.si_zoneid = getzoneid();
3008 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3009 			sqp->sq_info.si_signo = sigevk.sigev_signo;
3010 			sqp->sq_info.si_value.sival_int =
3011 			    sigevk.sigev_value.sival_int;
3012 			head->lio_sigqp = sqp;
3013 		} else {
3014 			head->lio_sigqp = NULL;
3015 		}
3016 		if (pkevtp) {
3017 			/*
3018 			 * Prepare data to send when list of aiocb's
3019 			 * has completed.
3020 			 */
3021 			port_init_event(pkevtp, (uintptr_t)sigev,
3022 			    (void *)(uintptr_t)pnotify.portnfy_user,
3023 			    NULL, head);
3024 			pkevtp->portkev_events = AIOLIO64;
3025 			head->lio_portkev = pkevtp;
3026 			head->lio_port = pnotify.portnfy_port;
3027 		}
3028 	}
3029 
3030 	for (i = 0; i < nent; i++, ucbp++) {
3031 
3032 		cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
3033 		/* skip entry if it can't be copied. */
3034 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
3035 			if (head) {
3036 				mutex_enter(&aiop->aio_mutex);
3037 				head->lio_nent--;
3038 				head->lio_refcnt--;
3039 				mutex_exit(&aiop->aio_mutex);
3040 			}
3041 			continue;
3042 		}
3043 
3044 		/* skip if opcode for aiocb is LIO_NOP */
3045 		mode = aiocb->aio_lio_opcode;
3046 		if (mode == LIO_NOP) {
3047 			cbp = NULL;
3048 			if (head) {
3049 				mutex_enter(&aiop->aio_mutex);
3050 				head->lio_nent--;
3051 				head->lio_refcnt--;
3052 				mutex_exit(&aiop->aio_mutex);
3053 			}
3054 			continue;
3055 		}
3056 
3057 		/* increment file descriptor's ref count. */
3058 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3059 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3060 			if (head) {
3061 				mutex_enter(&aiop->aio_mutex);
3062 				head->lio_nent--;
3063 				head->lio_refcnt--;
3064 				mutex_exit(&aiop->aio_mutex);
3065 			}
3066 			aio_errors++;
3067 			continue;
3068 		}
3069 
3070 		/*
3071 		 * check the permission of the partition
3072 		 */
3073 		if ((fp->f_flag & mode) == 0) {
3074 			releasef(aiocb->aio_fildes);
3075 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3076 			if (head) {
3077 				mutex_enter(&aiop->aio_mutex);
3078 				head->lio_nent--;
3079 				head->lio_refcnt--;
3080 				mutex_exit(&aiop->aio_mutex);
3081 			}
3082 			aio_errors++;
3083 			continue;
3084 		}
3085 
3086 		/*
3087 		 * common case where requests are to the same fd
3088 		 * for the same r/w operation
3089 		 * for UFS, need to set EBADFD
3090 		 */
3091 		vp = fp->f_vnode;
3092 		if (fp != prev_fp || mode != prev_mode) {
3093 			aio_func = check_vp(vp, mode);
3094 			if (aio_func == NULL) {
3095 				prev_fp = NULL;
3096 				releasef(aiocb->aio_fildes);
3097 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
3098 				aio_notsupported++;
3099 				if (head) {
3100 					mutex_enter(&aiop->aio_mutex);
3101 					head->lio_nent--;
3102 					head->lio_refcnt--;
3103 					mutex_exit(&aiop->aio_mutex);
3104 				}
3105 				continue;
3106 			} else {
3107 				prev_fp = fp;
3108 				prev_mode = mode;
3109 			}
3110 		}
3111 
3112 #ifdef	_LP64
3113 		aiocb_LFton(aiocb, &aiocb_n);
3114 		error = aio_req_setup(&reqp, aiop, &aiocb_n,
3115 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
3116 #else
3117 		error = aio_req_setupLF(&reqp, aiop, aiocb,
3118 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
3119 #endif  /* _LP64 */
3120 		if (error) {
3121 			releasef(aiocb->aio_fildes);
3122 			lio_set_uerror(&cbp->aio_resultp, error);
3123 			if (head) {
3124 				mutex_enter(&aiop->aio_mutex);
3125 				head->lio_nent--;
3126 				head->lio_refcnt--;
3127 				mutex_exit(&aiop->aio_mutex);
3128 			}
3129 			aio_errors++;
3130 			continue;
3131 		}
3132 
3133 		reqp->aio_req_lio = head;
3134 		deadhead = 0;
3135 
3136 		/*
3137 		 * Set the errno field now before sending the request to
3138 		 * the driver to avoid a race condition
3139 		 */
3140 		(void) suword32(&cbp->aio_resultp.aio_errno,
3141 		    EINPROGRESS);
3142 
3143 		reqp->aio_req_iocb.iocb32 = *ucbp;
3144 
3145 		event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
3146 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3147 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3148 		if (aio_port | aio_thread) {
3149 			port_kevent_t *lpkevp;
3150 			/*
3151 			 * Prepare data to send with each aiocb completed.
3152 			 */
3153 			if (aio_port) {
3154 				void *paddr = (void *)(uintptr_t)
3155 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3156 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3157 					error = EFAULT;
3158 			} else {	/* aio_thread */
3159 				pnotify.portnfy_port =
3160 				    aiocb->aio_sigevent.sigev_signo;
3161 				pnotify.portnfy_user =
3162 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3163 			}
3164 			if (error)
3165 				/* EMPTY */;
3166 			else if (pkevtp != NULL &&
3167 			    pnotify.portnfy_port == lio_head_port)
3168 				error = port_dup_event(pkevtp, &lpkevp,
3169 				    PORT_ALLOC_DEFAULT);
3170 			else
3171 				error = port_alloc_event(pnotify.portnfy_port,
3172 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3173 				    &lpkevp);
3174 			if (error == 0) {
3175 				port_init_event(lpkevp, (uintptr_t)*ucbp,
3176 				    (void *)(uintptr_t)pnotify.portnfy_user,
3177 				    aio_port_callback, reqp);
3178 				lpkevp->portkev_events = event;
3179 				reqp->aio_req_portkev = lpkevp;
3180 				reqp->aio_req_port = pnotify.portnfy_port;
3181 			}
3182 		}
3183 
3184 		/*
3185 		 * send the request to driver.
3186 		 */
3187 		if (error == 0) {
3188 			if (aiocb->aio_nbytes == 0) {
3189 				clear_active_fd(aiocb->aio_fildes);
3190 				aio_zerolen(reqp);
3191 				continue;
3192 			}
3193 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3194 			    CRED());
3195 		}
3196 
3197 		/*
3198 		 * the fd's ref count is not decremented until the IO has
3199 		 * completed unless there was an error.
3200 		 */
3201 		if (error) {
3202 			releasef(aiocb->aio_fildes);
3203 			lio_set_uerror(&cbp->aio_resultp, error);
3204 			if (head) {
3205 				mutex_enter(&aiop->aio_mutex);
3206 				head->lio_nent--;
3207 				head->lio_refcnt--;
3208 				mutex_exit(&aiop->aio_mutex);
3209 			}
3210 			if (error == ENOTSUP)
3211 				aio_notsupported++;
3212 			else
3213 				aio_errors++;
3214 			lio_set_error(reqp, portused);
3215 		} else {
3216 			clear_active_fd(aiocb->aio_fildes);
3217 		}
3218 	}
3219 
3220 	if (aio_notsupported) {
3221 		error = ENOTSUP;
3222 	} else if (aio_errors) {
3223 		/*
3224 		 * return EIO if any request failed
3225 		 */
3226 		error = EIO;
3227 	}
3228 
3229 	if (mode_arg == LIO_WAIT) {
3230 		mutex_enter(&aiop->aio_mutex);
3231 		while (head->lio_refcnt > 0) {
3232 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3233 				mutex_exit(&aiop->aio_mutex);
3234 				error = EINTR;
3235 				goto done;
3236 			}
3237 		}
3238 		mutex_exit(&aiop->aio_mutex);
3239 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
3240 	}
3241 
3242 done:
3243 	kmem_free(cbplist, ssize);
3244 	if (deadhead) {
3245 		if (head->lio_sigqp)
3246 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3247 		if (head->lio_portkev)
3248 			port_free_event(head->lio_portkev);
3249 		kmem_free(head, sizeof (aio_lio_t));
3250 	}
3251 	return (error);
3252 }
3253 
3254 #ifdef  _SYSCALL32_IMPL
3255 static void
3256 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
3257 {
3258 	dest->aio_fildes = src->aio_fildes;
3259 	dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
3260 	dest->aio_nbytes = (size_t)src->aio_nbytes;
3261 	dest->aio_offset = (off_t)src->aio_offset;
3262 	dest->aio_reqprio = src->aio_reqprio;
3263 	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3264 	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3265 
3266 	/*
3267 	 * See comment in sigqueue32() on handling of 32-bit
3268 	 * sigvals in a 64-bit kernel.
3269 	 */
3270 	dest->aio_sigevent.sigev_value.sival_int =
3271 	    (int)src->aio_sigevent.sigev_value.sival_int;
3272 	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3273 	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
3274 	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3275 	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3276 	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3277 	dest->aio_lio_opcode = src->aio_lio_opcode;
3278 	dest->aio_state = src->aio_state;
3279 	dest->aio__pad[0] = src->aio__pad[0];
3280 }
3281 #endif
3282 
3283 /*
3284  * This function is used only for largefile calls made by
3285  * 32 bit applications.
3286  */
3287 static int
3288 aio_req_setupLF(
3289 	aio_req_t	**reqpp,
3290 	aio_t		*aiop,
3291 	aiocb64_32_t	*arg,
3292 	aio_result_t	*resultp,
3293 	vnode_t		*vp,
3294 	int		old_solaris_req)
3295 {
3296 	sigqueue_t	*sqp = NULL;
3297 	aio_req_t	*reqp;
3298 	struct uio	*uio;
3299 	struct sigevent32 *sigev;
3300 	int 		error;
3301 
3302 	sigev = &arg->aio_sigevent;
3303 	if (sigev->sigev_notify == SIGEV_SIGNAL &&
3304 	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
3305 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3306 		if (sqp == NULL)
3307 			return (EAGAIN);
3308 		sqp->sq_func = NULL;
3309 		sqp->sq_next = NULL;
3310 		sqp->sq_info.si_code = SI_ASYNCIO;
3311 		sqp->sq_info.si_pid = curproc->p_pid;
3312 		sqp->sq_info.si_ctid = PRCTID(curproc);
3313 		sqp->sq_info.si_zoneid = getzoneid();
3314 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3315 		sqp->sq_info.si_signo = sigev->sigev_signo;
3316 		sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
3317 	}
3318 
3319 	mutex_enter(&aiop->aio_mutex);
3320 
3321 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
3322 		mutex_exit(&aiop->aio_mutex);
3323 		if (sqp)
3324 			kmem_free(sqp, sizeof (sigqueue_t));
3325 		return (EIO);
3326 	}
3327 	/*
3328 	 * get an aio_reqp from the free list or allocate one
3329 	 * from dynamic memory.
3330 	 */
3331 	if (error = aio_req_alloc(&reqp, resultp)) {
3332 		mutex_exit(&aiop->aio_mutex);
3333 		if (sqp)
3334 			kmem_free(sqp, sizeof (sigqueue_t));
3335 		return (error);
3336 	}
3337 	aiop->aio_pending++;
3338 	aiop->aio_outstanding++;
3339 	reqp->aio_req_flags = AIO_PENDING;
3340 	if (old_solaris_req) {
3341 		/* this is an old solaris aio request */
3342 		reqp->aio_req_flags |= AIO_SOLARIS;
3343 		aiop->aio_flags |= AIO_SOLARIS_REQ;
3344 	}
3345 	if (sigev->sigev_notify == SIGEV_THREAD ||
3346 	    sigev->sigev_notify == SIGEV_PORT)
3347 		aio_enq(&aiop->aio_portpending, reqp, 0);
3348 	mutex_exit(&aiop->aio_mutex);
3349 	/*
3350 	 * initialize aio request.
3351 	 */
3352 	reqp->aio_req_fd = arg->aio_fildes;
3353 	reqp->aio_req_sigqp = sqp;
3354 	reqp->aio_req_iocb.iocb = NULL;
3355 	reqp->aio_req_lio = NULL;
3356 	reqp->aio_req_buf.b_file = vp;
3357 	uio = reqp->aio_req.aio_uio;
3358 	uio->uio_iovcnt = 1;
3359 	uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
3360 	uio->uio_iov->iov_len = arg->aio_nbytes;
3361 	uio->uio_loffset = arg->aio_offset;
3362 	*reqpp = reqp;
3363 	return (0);
3364 }
3365 
3366 /*
3367  * This routine is called when a non largefile call is made by a 32bit
3368  * process on a ILP32 or LP64 kernel.
3369  */
3370 static int
3371 alio32(
3372 	int		mode_arg,
3373 	void		*aiocb_arg,
3374 	int		nent,
3375 	void		*sigev)
3376 {
3377 	file_t		*fp;
3378 	file_t		*prev_fp = NULL;
3379 	int		prev_mode = -1;
3380 	struct vnode	*vp;
3381 	aio_lio_t	*head;
3382 	aio_req_t	*reqp;
3383 	aio_t		*aiop;
3384 	caddr_t		cbplist;
3385 	aiocb_t		cb;
3386 	aiocb_t		*aiocb = &cb;
3387 #ifdef	_LP64
3388 	aiocb32_t	*cbp;
3389 	caddr32_t	*ucbp;
3390 	aiocb32_t	cb32;
3391 	aiocb32_t	*aiocb32 = &cb32;
3392 	struct sigevent32	sigevk;
3393 #else
3394 	aiocb_t		*cbp, **ucbp;
3395 	struct sigevent	sigevk;
3396 #endif
3397 	sigqueue_t	*sqp;
3398 	int		(*aio_func)();
3399 	int		mode;
3400 	int		error = 0;
3401 	int		aio_errors = 0;
3402 	int		i;
3403 	size_t		ssize;
3404 	int		deadhead = 0;
3405 	int		aio_notsupported = 0;
3406 	int		lio_head_port;
3407 	int		aio_port;
3408 	int		aio_thread;
3409 	port_kevent_t	*pkevtp = NULL;
3410 	int		portused = 0;
3411 #ifdef	_LP64
3412 	port_notify32_t	pnotify;
3413 #else
3414 	port_notify_t	pnotify;
3415 #endif
3416 	int		event;
3417 
3418 	aiop = curproc->p_aio;
3419 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
3420 		return (EINVAL);
3421 
3422 #ifdef	_LP64
3423 	ssize = (sizeof (caddr32_t) * nent);
3424 #else
3425 	ssize = (sizeof (aiocb_t *) * nent);
3426 #endif
3427 	cbplist = kmem_alloc(ssize, KM_SLEEP);
3428 	ucbp = (void *)cbplist;
3429 
3430 	if (copyin(aiocb_arg, cbplist, ssize) ||
3431 	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) {
3432 		kmem_free(cbplist, ssize);
3433 		return (EFAULT);
3434 	}
3435 
3436 	/* Event Ports  */
3437 	if (sigev &&
3438 	    (sigevk.sigev_notify == SIGEV_THREAD ||
3439 	    sigevk.sigev_notify == SIGEV_PORT)) {
3440 		if (sigevk.sigev_notify == SIGEV_THREAD) {
3441 			pnotify.portnfy_port = sigevk.sigev_signo;
3442 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
3443 		} else if (copyin(
3444 		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
3445 		    &pnotify, sizeof (pnotify))) {
3446 			kmem_free(cbplist, ssize);
3447 			return (EFAULT);
3448 		}
3449 		error = port_alloc_event(pnotify.portnfy_port,
3450 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
3451 		if (error) {
3452 			if (error == ENOMEM || error == EAGAIN)
3453 				error = EAGAIN;
3454 			else
3455 				error = EINVAL;
3456 			kmem_free(cbplist, ssize);
3457 			return (error);
3458 		}
3459 		lio_head_port = pnotify.portnfy_port;
3460 		portused = 1;
3461 	}
3462 
3463 	/*
3464 	 * a list head should be allocated if notification is
3465 	 * enabled for this list.
3466 	 */
3467 	head = NULL;
3468 
3469 	if (mode_arg == LIO_WAIT || sigev) {
3470 		mutex_enter(&aiop->aio_mutex);
3471 		error = aio_lio_alloc(&head);
3472 		mutex_exit(&aiop->aio_mutex);
3473 		if (error)
3474 			goto done;
3475 		deadhead = 1;
3476 		head->lio_nent = nent;
3477 		head->lio_refcnt = nent;
3478 		head->lio_port = -1;
3479 		head->lio_portkev = NULL;
3480 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
3481 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
3482 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3483 			if (sqp == NULL) {
3484 				error = EAGAIN;
3485 				goto done;
3486 			}
3487 			sqp->sq_func = NULL;
3488 			sqp->sq_next = NULL;
3489 			sqp->sq_info.si_code = SI_ASYNCIO;
3490 			sqp->sq_info.si_pid = curproc->p_pid;
3491 			sqp->sq_info.si_ctid = PRCTID(curproc);
3492 			sqp->sq_info.si_zoneid = getzoneid();
3493 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3494 			sqp->sq_info.si_signo = sigevk.sigev_signo;
3495 			sqp->sq_info.si_value.sival_int =
3496 			    sigevk.sigev_value.sival_int;
3497 			head->lio_sigqp = sqp;
3498 		} else {
3499 			head->lio_sigqp = NULL;
3500 		}
3501 		if (pkevtp) {
3502 			/*
3503 			 * Prepare data to send when list of aiocb's has
3504 			 * completed.
3505 			 */
3506 			port_init_event(pkevtp, (uintptr_t)sigev,
3507 			    (void *)(uintptr_t)pnotify.portnfy_user,
3508 			    NULL, head);
3509 			pkevtp->portkev_events = AIOLIO;
3510 			head->lio_portkev = pkevtp;
3511 			head->lio_port = pnotify.portnfy_port;
3512 		}
3513 	}
3514 
3515 	for (i = 0; i < nent; i++, ucbp++) {
3516 
3517 		/* skip entry if it can't be copied. */
3518 #ifdef	_LP64
3519 		cbp = (aiocb32_t *)(uintptr_t)*ucbp;
3520 		if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32)))
3521 #else
3522 		cbp = (aiocb_t *)*ucbp;
3523 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb)))
3524 #endif
3525 		{
3526 			if (head) {
3527 				mutex_enter(&aiop->aio_mutex);
3528 				head->lio_nent--;
3529 				head->lio_refcnt--;
3530 				mutex_exit(&aiop->aio_mutex);
3531 			}
3532 			continue;
3533 		}
3534 #ifdef	_LP64
3535 		/*
3536 		 * copy 32 bit structure into 64 bit structure
3537 		 */
3538 		aiocb_32ton(aiocb32, aiocb);
3539 #endif /* _LP64 */
3540 
3541 		/* skip if opcode for aiocb is LIO_NOP */
3542 		mode = aiocb->aio_lio_opcode;
3543 		if (mode == LIO_NOP) {
3544 			cbp = NULL;
3545 			if (head) {
3546 				mutex_enter(&aiop->aio_mutex);
3547 				head->lio_nent--;
3548 				head->lio_refcnt--;
3549 				mutex_exit(&aiop->aio_mutex);
3550 			}
3551 			continue;
3552 		}
3553 
3554 		/* increment file descriptor's ref count. */
3555 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3556 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3557 			if (head) {
3558 				mutex_enter(&aiop->aio_mutex);
3559 				head->lio_nent--;
3560 				head->lio_refcnt--;
3561 				mutex_exit(&aiop->aio_mutex);
3562 			}
3563 			aio_errors++;
3564 			continue;
3565 		}
3566 
3567 		/*
3568 		 * check the permission of the partition
3569 		 */
3570 		if ((fp->f_flag & mode) == 0) {
3571 			releasef(aiocb->aio_fildes);
3572 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3573 			if (head) {
3574 				mutex_enter(&aiop->aio_mutex);
3575 				head->lio_nent--;
3576 				head->lio_refcnt--;
3577 				mutex_exit(&aiop->aio_mutex);
3578 			}
3579 			aio_errors++;
3580 			continue;
3581 		}
3582 
3583 		/*
3584 		 * common case where requests are to the same fd
3585 		 * for the same r/w operation
3586 		 * for UFS, need to set EBADFD
3587 		 */
3588 		vp = fp->f_vnode;
3589 		if (fp != prev_fp || mode != prev_mode) {
3590 			aio_func = check_vp(vp, mode);
3591 			if (aio_func == NULL) {
3592 				prev_fp = NULL;
3593 				releasef(aiocb->aio_fildes);
3594 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
3595 				aio_notsupported++;
3596 				if (head) {
3597 					mutex_enter(&aiop->aio_mutex);
3598 					head->lio_nent--;
3599 					head->lio_refcnt--;
3600 					mutex_exit(&aiop->aio_mutex);
3601 				}
3602 				continue;
3603 			} else {
3604 				prev_fp = fp;
3605 				prev_mode = mode;
3606 			}
3607 		}
3608 
3609 		error = aio_req_setup(&reqp, aiop, aiocb,
3610 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
3611 		if (error) {
3612 			releasef(aiocb->aio_fildes);
3613 			lio_set_uerror(&cbp->aio_resultp, error);
3614 			if (head) {
3615 				mutex_enter(&aiop->aio_mutex);
3616 				head->lio_nent--;
3617 				head->lio_refcnt--;
3618 				mutex_exit(&aiop->aio_mutex);
3619 			}
3620 			aio_errors++;
3621 			continue;
3622 		}
3623 
3624 		reqp->aio_req_lio = head;
3625 		deadhead = 0;
3626 
3627 		/*
3628 		 * Set the errno field now before sending the request to
3629 		 * the driver to avoid a race condition
3630 		 */
3631 		(void) suword32(&cbp->aio_resultp.aio_errno,
3632 		    EINPROGRESS);
3633 
3634 		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp;
3635 
3636 		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
3637 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3638 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3639 		if (aio_port | aio_thread) {
3640 			port_kevent_t *lpkevp;
3641 			/*
3642 			 * Prepare data to send with each aiocb completed.
3643 			 */
3644 #ifdef _LP64
3645 			if (aio_port) {
3646 				void *paddr = (void  *)(uintptr_t)
3647 				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
3648 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3649 					error = EFAULT;
3650 			} else {	/* aio_thread */
3651 				pnotify.portnfy_port =
3652 				    aiocb32->aio_sigevent.sigev_signo;
3653 				pnotify.portnfy_user =
3654 				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
3655 			}
3656 #else
3657 			if (aio_port) {
3658 				void *paddr =
3659 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3660 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3661 					error = EFAULT;
3662 			} else {	/* aio_thread */
3663 				pnotify.portnfy_port =
3664 				    aiocb->aio_sigevent.sigev_signo;
3665 				pnotify.portnfy_user =
3666 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3667 			}
3668 #endif
3669 			if (error)
3670 				/* EMPTY */;
3671 			else if (pkevtp != NULL &&
3672 			    pnotify.portnfy_port == lio_head_port)
3673 				error = port_dup_event(pkevtp, &lpkevp,
3674 				    PORT_ALLOC_DEFAULT);
3675 			else
3676 				error = port_alloc_event(pnotify.portnfy_port,
3677 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3678 				    &lpkevp);
3679 			if (error == 0) {
3680 				port_init_event(lpkevp, (uintptr_t)cbp,
3681 				    (void *)(uintptr_t)pnotify.portnfy_user,
3682 				    aio_port_callback, reqp);
3683 				lpkevp->portkev_events = event;
3684 				reqp->aio_req_portkev = lpkevp;
3685 				reqp->aio_req_port = pnotify.portnfy_port;
3686 			}
3687 		}
3688 
3689 		/*
3690 		 * send the request to driver.
3691 		 */
3692 		if (error == 0) {
3693 			if (aiocb->aio_nbytes == 0) {
3694 				clear_active_fd(aiocb->aio_fildes);
3695 				aio_zerolen(reqp);
3696 				continue;
3697 			}
3698 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3699 			    CRED());
3700 		}
3701 
3702 		/*
3703 		 * the fd's ref count is not decremented until the IO has
3704 		 * completed unless there was an error.
3705 		 */
3706 		if (error) {
3707 			releasef(aiocb->aio_fildes);
3708 			lio_set_uerror(&cbp->aio_resultp, error);
3709 			if (head) {
3710 				mutex_enter(&aiop->aio_mutex);
3711 				head->lio_nent--;
3712 				head->lio_refcnt--;
3713 				mutex_exit(&aiop->aio_mutex);
3714 			}
3715 			if (error == ENOTSUP)
3716 				aio_notsupported++;
3717 			else
3718 				aio_errors++;
3719 			lio_set_error(reqp, portused);
3720 		} else {
3721 			clear_active_fd(aiocb->aio_fildes);
3722 		}
3723 	}
3724 
3725 	if (aio_notsupported) {
3726 		error = ENOTSUP;
3727 	} else if (aio_errors) {
3728 		/*
3729 		 * return EIO if any request failed
3730 		 */
3731 		error = EIO;
3732 	}
3733 
3734 	if (mode_arg == LIO_WAIT) {
3735 		mutex_enter(&aiop->aio_mutex);
3736 		while (head->lio_refcnt > 0) {
3737 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3738 				mutex_exit(&aiop->aio_mutex);
3739 				error = EINTR;
3740 				goto done;
3741 			}
3742 		}
3743 		mutex_exit(&aiop->aio_mutex);
3744 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
3745 	}
3746 
3747 done:
3748 	kmem_free(cbplist, ssize);
3749 	if (deadhead) {
3750 		if (head->lio_sigqp)
3751 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3752 		if (head->lio_portkev)
3753 			port_free_event(head->lio_portkev);
3754 		kmem_free(head, sizeof (aio_lio_t));
3755 	}
3756 	return (error);
3757 }
3758 
3759 
#ifdef  _SYSCALL32_IMPL
/*
 * Convert a 32-bit aiocb (aiocb32_t) into the native aiocb_t layout.
 * Every field assigned below is taken from *src and widened or cast to
 * the native type; fields of *dest that are not assigned here are left
 * unchanged.
 */
void
aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
{
	struct sigevent32	*sev32 = &src->aio_sigevent;
	struct sigevent		*sev = &dest->aio_sigevent;

	/* request description */
	dest->aio_fildes = src->aio_fildes;
	dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
	dest->aio_nbytes = (size_t)src->aio_nbytes;
	dest->aio_offset = (off_t)src->aio_offset;
	dest->aio_reqprio = src->aio_reqprio;
	dest->aio_lio_opcode = src->aio_lio_opcode;
	dest->aio_state = src->aio_state;
	dest->aio__pad[0] = src->aio__pad[0];

	/* notification description */
	sev->sigev_notify = sev32->sigev_notify;
	sev->sigev_signo = sev32->sigev_signo;

	/*
	 * See comment in sigqueue32() on handling of 32-bit
	 * sigvals in a 64-bit kernel.
	 */
	sev->sigev_value.sival_int = (int)sev32->sigev_value.sival_int;

	/* 32-bit user addresses are widened through uintptr_t */
	sev->sigev_notify_function = (void (*)(union sigval))
	    (uintptr_t)sev32->sigev_notify_function;
	sev->sigev_notify_attributes = (pthread_attr_t *)
	    (uintptr_t)sev32->sigev_notify_attributes;
	sev->__sigev_pad2 = sev32->__sigev_pad2;
}
#endif /* _SYSCALL32_IMPL */
3788 
3789 /*
3790  * aio_port_callback() is called just before the event is retrieved from the
3791  * port. The task of this callback function is to finish the work of the
3792  * transaction for the application, it means :
3793  * - copyout transaction data to the application
3794  *	(this thread is running in the right process context)
3795  * - keep trace of the transaction (update of counters).
3796  * - free allocated buffers
3797  * The aiocb pointer is the object element of the port_kevent_t structure.
3798  *
3799  * flag :
3800  *	PORT_CALLBACK_DEFAULT : do copyout and free resources
3801  *	PORT_CALLBACK_CLOSE   : don't do copyout, free resources
3802  */
3803 
3804 /*ARGSUSED*/
3805 int
3806 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
3807 {
3808 	aio_t		*aiop = curproc->p_aio;
3809 	aio_req_t	*reqp = arg;
3810 	struct	iovec	*iov;
3811 	struct	buf	*bp;
3812 	void		*resultp;
3813 
3814 	if (pid != curproc->p_pid) {
3815 		/* wrong proc !!, can not deliver data here ... */
3816 		return (EACCES);
3817 	}
3818 
3819 	mutex_enter(&aiop->aio_portq_mutex);
3820 	reqp->aio_req_portkev = NULL;
3821 	aio_req_remove_portq(aiop, reqp); /* remove request from portq */
3822 	mutex_exit(&aiop->aio_portq_mutex);
3823 	aphysio_unlock(reqp);		/* unlock used pages */
3824 	mutex_enter(&aiop->aio_mutex);
3825 	if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
3826 		aio_req_free_port(aiop, reqp);	/* back to free list */
3827 		mutex_exit(&aiop->aio_mutex);
3828 		return (0);
3829 	}
3830 
3831 	iov = reqp->aio_req_uio.uio_iov;
3832 	bp = &reqp->aio_req_buf;
3833 	resultp = (void *)reqp->aio_req_resultp;
3834 	aio_req_free_port(aiop, reqp);	/* request struct back to free list */
3835 	mutex_exit(&aiop->aio_mutex);
3836 	if (flag == PORT_CALLBACK_DEFAULT)
3837 		aio_copyout_result_port(iov, bp, resultp);
3838 	return (0);
3839 }
3840