xref: /titanic_44/usr/src/uts/common/os/aio.c (revision ace1a5f11236a072fca1b5e0ea1416a083a9f2aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Kernel asynchronous I/O.
31  * This is only for raw devices now (as of Nov. 1993).
32  */
33 
34 #include <sys/types.h>
35 #include <sys/errno.h>
36 #include <sys/conf.h>
37 #include <sys/file.h>
38 #include <sys/fs/snode.h>
39 #include <sys/unistd.h>
40 #include <sys/cmn_err.h>
41 #include <vm/as.h>
42 #include <vm/faultcode.h>
43 #include <sys/sysmacros.h>
44 #include <sys/procfs.h>
45 #include <sys/kmem.h>
46 #include <sys/autoconf.h>
47 #include <sys/ddi_impldefs.h>
48 #include <sys/sunddi.h>
49 #include <sys/aio_impl.h>
50 #include <sys/debug.h>
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/vmsystm.h>
54 #include <sys/fs/pxfs_ki.h>
55 #include <sys/contract/process_impl.h>
56 
57 /*
58  * external entry point.
59  */
60 #ifdef _LP64
61 static int64_t kaioc(long, long, long, long, long, long);
62 #endif
63 static int kaio(ulong_t *, rval_t *);
64 
65 
/*
 * Run modes handed to the shared handlers (aliowait(), aiosuspend(),
 * aioerror(), aiorw(), aio_cancel()) so they know which user aiocb
 * layout they are dealing with:
 *	AIO_64		passed by kaioc(), the native LP64 entry point
 *	AIO_32		passed by kaio() for the regular commands
 *	AIO_LARGEFILE	passed by kaio() for the *64 (large file) commands
 */
#define	AIO_64	0
#define	AIO_32	1
#define	AIO_LARGEFILE	2
69 
70 /*
71  * implementation specific functions (private)
72  */
73 #ifdef _LP64
74 static int alio(int, int, aiocb_t **, int, struct sigevent *);
75 #endif
76 static int aionotify(void);
77 static int aioinit(void);
78 static int aiostart(void);
79 static void alio_cleanup(aio_t *, aiocb_t **, int, int);
80 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
81     cred_t *);
82 static void lio_set_error(aio_req_t *);
83 static aio_t *aio_aiop_alloc();
84 static int aio_req_alloc(aio_req_t **, aio_result_t *);
85 static int aio_lio_alloc(aio_lio_t **);
86 static aio_req_t *aio_req_done(void *);
87 static aio_req_t *aio_req_remove(aio_req_t *);
88 static int aio_req_find(aio_result_t *, aio_req_t **);
89 static int aio_hash_insert(struct aio_req_t *, aio_t *);
90 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
91     aio_result_t *, int, vnode_t *);
92 static int aio_cleanup_thread(aio_t *);
93 static aio_lio_t *aio_list_get(aio_result_t *);
94 static void lio_set_uerror(void *, int);
95 extern void aio_zerolen(aio_req_t *);
96 static int aiowait(struct timeval *, int, long	*);
97 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
98 static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
99     aio_req_t *reqlist, aio_t *aiop, model_t model);
100 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
101 static int aiosuspend(void *, int, struct  timespec *, int,
102     long	*, int);
103 static int aliowait(int, void *, int, void *, int);
104 static int aioerror(void *, int);
105 static int aio_cancel(int, void *, long	*, int);
106 static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
107 static int aiorw(int, void *, int, int);
108 
109 static int alioLF(int, void *, int, void *);
110 static int aio_req_setupLF(aio_req_t **, aio_t *,
111     aiocb64_32_t *, aio_result_t *, int, vnode_t *);
112 static int alio32(int, void *, int, void *);
113 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
114 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
115 
116 #ifdef  _SYSCALL32_IMPL
117 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
118 void	aiocb_32ton(aiocb32_t *, aiocb_t *);
119 #endif /* _SYSCALL32_IMPL */
120 
121 /*
122  * implementation specific functions (external)
123  */
124 void aio_req_free(aio_t *, aio_req_t *);
125 
126 /*
127  * Event Port framework
128  */
129 
130 void aio_req_free_port(aio_t *, aio_req_t *);
131 static int aio_port_callback(void *, int *, pid_t, int, void *);
132 
133 /*
134  * This is the loadable module wrapper.
135  */
136 #include <sys/modctl.h>
137 #include <sys/syscall.h>
138 
#ifdef _LP64

/*
 * System call table entry for native callers on an LP64 kernel; it
 * dispatches to kaioc().  The leading 6 matches the six long arguments
 * kaioc() takes (presumably the argument count field of struct sysent --
 * confirm against <sys/systm.h>).
 */
static struct sysent kaio_sysent = {
	6,
	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
	(int (*)())kaioc
};

#ifdef _SYSCALL32_IMPL
/*
 * System call table entry for 32-bit callers on an LP64 kernel; it
 * dispatches to kaio(), which reads argument words uap[0] through uap[6]
 * (seven in total).
 */
static struct sysent kaio_sysent32 = {
	7,
	SE_NOUNLOAD | SE_64RVAL,
	kaio
};
#endif  /* _SYSCALL32_IMPL */

#else   /* _LP64 */

/*
 * System call table entry on a non-LP64 kernel, where kaio() is the only
 * dispatcher.
 */
static struct sysent kaio_sysent = {
	7,
	SE_NOUNLOAD | SE_32RVAL1,
	kaio
};

#endif  /* _LP64 */
164 
165 /*
166  * Module linkage information for the kernel.
167  */
168 
/*
 * Linkage element describing the kaio system call: it ties the
 * mod_syscallops operations vector to the kaio_sysent entry and supplies
 * the descriptive module name.
 */
static struct modlsys modlsys = {
	&mod_syscallops,
	"kernel Async I/O",
	&kaio_sysent
};
174 
#ifdef  _SYSCALL32_IMPL
/*
 * Linkage element for the 32-bit compatibility system call: it ties
 * mod_syscallops32 to kaio_sysent32.  Only built when _SYSCALL32_IMPL is
 * defined.
 */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"kernel Async I/O for 32 bit compatibility",
	&kaio_sysent32
};
#endif  /* _SYSCALL32_IMPL */
182 
183 
/*
 * Module linkage handed to mod_install(), mod_remove() and mod_info() by
 * _init(), _fini() and _info().  The NULL-terminated list holds the native
 * system call element and, when _SYSCALL32_IMPL is defined, the 32-bit
 * compatibility element as well.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef  _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
192 
193 int
194 _init(void)
195 {
196 	int retval;
197 
198 	if ((retval = mod_install(&modlinkage)) != 0)
199 		return (retval);
200 
201 	return (0);
202 }
203 
204 int
205 _fini(void)
206 {
207 	int retval;
208 
209 	retval = mod_remove(&modlinkage);
210 
211 	return (retval);
212 }
213 
214 int
215 _info(struct modinfo *modinfop)
216 {
217 	return (mod_info(&modlinkage, modinfop));
218 }
219 
220 #ifdef	_LP64
221 static int64_t
222 kaioc(
223 	long	a0,
224 	long	a1,
225 	long	a2,
226 	long	a3,
227 	long	a4,
228 	long	a5)
229 {
230 	int	error;
231 	long	rval = 0;
232 
233 	switch ((int)a0 & ~AIO_POLL_BIT) {
234 	case AIOREAD:
235 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
236 		    (offset_t)a4, (aio_result_t *)a5, FREAD);
237 		break;
238 	case AIOWRITE:
239 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
240 		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
241 		break;
242 	case AIOWAIT:
243 		error = aiowait((struct timeval *)a1, (int)a2, &rval);
244 		break;
245 	case AIOWAITN:
246 		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
247 		    (timespec_t *)a4);
248 		break;
249 	case AIONOTIFY:
250 		error = aionotify();
251 		break;
252 	case AIOINIT:
253 		error = aioinit();
254 		break;
255 	case AIOSTART:
256 		error = aiostart();
257 		break;
258 	case AIOLIO:
259 		error = alio((int)a0, (int)a1, (aiocb_t **)a2, (int)a3,
260 		    (struct sigevent *)a4);
261 		break;
262 	case AIOLIOWAIT:
263 		error = aliowait((int)a1, (void *)a2, (int)a3,
264 		    (struct sigevent *)a4, AIO_64);
265 		break;
266 	case AIOSUSPEND:
267 		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
268 		    (int)a4, &rval, AIO_64);
269 		break;
270 	case AIOERROR:
271 		error = aioerror((void *)a1, AIO_64);
272 		break;
273 	case AIOAREAD:
274 		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
275 		break;
276 	case AIOAWRITE:
277 		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
278 		break;
279 	case AIOCANCEL:
280 		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
281 		break;
282 
283 	/*
284 	 * The large file related stuff is valid only for
285 	 * 32 bit kernel and not for 64 bit kernel
286 	 * On 64 bit kernel we convert large file calls
287 	 * to regular 64bit calls.
288 	 */
289 
290 	default:
291 		error = EINVAL;
292 	}
293 	if (error)
294 		return ((int64_t)set_errno(error));
295 	return (rval);
296 }
297 #endif
298 
299 static int
300 kaio(
301 	ulong_t *uap,
302 	rval_t *rvp)
303 {
304 	long rval = 0;
305 	int	error = 0;
306 	offset_t	off;
307 
308 
309 		rvp->r_vals = 0;
310 #if defined(_LITTLE_ENDIAN)
311 	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
312 #else
313 	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
314 #endif
315 
316 	switch (uap[0] & ~AIO_POLL_BIT) {
317 	/*
318 	 * It must be the 32 bit system call on 64 bit kernel
319 	 */
320 	case AIOREAD:
321 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
322 		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
323 	case AIOWRITE:
324 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
325 		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
326 	case AIOWAIT:
327 		error = aiowait((struct	timeval *)uap[1], (int)uap[2],
328 		    &rval);
329 		break;
330 	case AIOWAITN:
331 		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
332 		    (uint_t *)uap[3], (timespec_t *)uap[4]);
333 		break;
334 	case AIONOTIFY:
335 		return (aionotify());
336 	case AIOINIT:
337 		return (aioinit());
338 	case AIOSTART:
339 		return (aiostart());
340 	case AIOLIO:
341 		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
342 		    (void *)uap[4]));
343 	case AIOLIOWAIT:
344 		return (aliowait((int)uap[1], (void *)uap[2],
345 		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
346 	case AIOSUSPEND:
347 		error = aiosuspend((void *)uap[1], (int)uap[2],
348 		    (timespec_t *)uap[3], (int)uap[4],
349 		    &rval, AIO_32);
350 		break;
351 	case AIOERROR:
352 		return (aioerror((void *)uap[1], AIO_32));
353 	case AIOAREAD:
354 		return (aiorw((int)uap[0], (void *)uap[1],
355 		    FREAD, AIO_32));
356 	case AIOAWRITE:
357 		return (aiorw((int)uap[0], (void *)uap[1],
358 		    FWRITE, AIO_32));
359 	case AIOCANCEL:
360 		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
361 		    AIO_32));
362 		break;
363 	case AIOLIO64:
364 		return (alioLF((int)uap[1], (void *)uap[2],
365 		    (int)uap[3], (void *)uap[4]));
366 	case AIOLIOWAIT64:
367 		return (aliowait(uap[1], (void *)uap[2],
368 		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
369 	case AIOSUSPEND64:
370 		error = aiosuspend((void *)uap[1], (int)uap[2],
371 		    (timespec_t *)uap[3], (int)uap[4], &rval,
372 		    AIO_LARGEFILE);
373 		break;
374 	case AIOERROR64:
375 		return (aioerror((void *)uap[1], AIO_LARGEFILE));
376 	case AIOAREAD64:
377 		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
378 		    AIO_LARGEFILE));
379 	case AIOAWRITE64:
380 		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
381 		    AIO_LARGEFILE));
382 	case AIOCANCEL64:
383 		error = (aio_cancel((int)uap[1], (void *)uap[2],
384 		    &rval, AIO_LARGEFILE));
385 		break;
386 	default:
387 		return (EINVAL);
388 	}
389 
390 	rvp->r_val1 = rval;
391 	return (error);
392 }
393 
394 /*
395  * wake up LWPs in this process that are sleeping in
396  * aiowait().
397  */
398 static int
399 aionotify(void)
400 {
401 	aio_t	*aiop;
402 
403 	aiop = curproc->p_aio;
404 	if (aiop == NULL)
405 		return (0);
406 
407 	mutex_enter(&aiop->aio_mutex);
408 	aiop->aio_notifycnt++;
409 	cv_broadcast(&aiop->aio_waitcv);
410 	mutex_exit(&aiop->aio_mutex);
411 
412 	return (0);
413 }
414 
415 static int
416 timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
417 	timestruc_t **rqtp, int *blocking)
418 {
419 #ifdef	_SYSCALL32_IMPL
420 	struct timeval32 wait_time_32;
421 #endif
422 	struct timeval wait_time;
423 	model_t	model = get_udatamodel();
424 
425 	*rqtp = NULL;
426 	if (timout == NULL) {		/* wait indefinitely */
427 		*blocking = 1;
428 		return (0);
429 	}
430 
431 	/*
432 	 * Need to correctly compare with the -1 passed in for a user
433 	 * address pointer, with both 32 bit and 64 bit apps.
434 	 */
435 	if (model == DATAMODEL_NATIVE) {
436 		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
437 			*blocking = 0;
438 			return (0);
439 		}
440 
441 		if (copyin(timout, &wait_time, sizeof (wait_time)))
442 			return (EFAULT);
443 	}
444 #ifdef	_SYSCALL32_IMPL
445 	else {
446 		/*
447 		 * -1 from a 32bit app. It will not get sign extended.
448 		 * don't wait if -1.
449 		 */
450 		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
451 			*blocking = 0;
452 			return (0);
453 		}
454 
455 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
456 			return (EFAULT);
457 		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
458 	}
459 #endif  /* _SYSCALL32_IMPL */
460 
461 	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
462 		*blocking = 0;
463 		return (0);
464 	}
465 
466 	if (wait_time.tv_sec < 0 ||
467 	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
468 		return (EINVAL);
469 
470 	rqtime->tv_sec = wait_time.tv_sec;
471 	rqtime->tv_nsec = wait_time.tv_usec * 1000;
472 	*rqtp = rqtime;
473 	*blocking = 1;
474 
475 	return (0);
476 }
477 
478 static int
479 timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
480 	timestruc_t **rqtp, int *blocking)
481 {
482 #ifdef	_SYSCALL32_IMPL
483 	timespec32_t wait_time_32;
484 #endif
485 	model_t	model = get_udatamodel();
486 
487 	*rqtp = NULL;
488 	if (timout == NULL) {
489 		*blocking = 1;
490 		return (0);
491 	}
492 
493 	if (model == DATAMODEL_NATIVE) {
494 		if (copyin(timout, rqtime, sizeof (*rqtime)))
495 			return (EFAULT);
496 	}
497 #ifdef	_SYSCALL32_IMPL
498 	else {
499 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
500 			return (EFAULT);
501 		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
502 	}
503 #endif  /* _SYSCALL32_IMPL */
504 
505 	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
506 		*blocking = 0;
507 		return (0);
508 	}
509 
510 	if (rqtime->tv_sec < 0 ||
511 	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
512 		return (EINVAL);
513 
514 	*rqtp = rqtime;
515 	*blocking = 1;
516 
517 	return (0);
518 }
519 
520 /*ARGSUSED*/
521 static int
522 aiowait(
523 	struct timeval	*timout,
524 	int	dontblockflg,
525 	long	*rval)
526 {
527 	int 		error;
528 	aio_t		*aiop;
529 	aio_req_t	*reqp;
530 	clock_t		status;
531 	int		blocking;
532 	int		timecheck;
533 	timestruc_t	rqtime;
534 	timestruc_t	*rqtp;
535 
536 	aiop = curproc->p_aio;
537 	if (aiop == NULL)
538 		return (EINVAL);
539 
540 	/*
541 	 * Establish the absolute future time for the timeout.
542 	 */
543 	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
544 	if (error)
545 		return (error);
546 	if (rqtp) {
547 		timestruc_t now;
548 		timecheck = timechanged;
549 		gethrestime(&now);
550 		timespecadd(rqtp, &now);
551 	}
552 
553 	mutex_enter(&aiop->aio_mutex);
554 	for (;;) {
555 		/* process requests on poll queue */
556 		if (aiop->aio_pollq) {
557 			mutex_exit(&aiop->aio_mutex);
558 			aio_cleanup(0);
559 			mutex_enter(&aiop->aio_mutex);
560 		}
561 		if ((reqp = aio_req_remove(NULL)) != NULL) {
562 			*rval = (long)reqp->aio_req_resultp;
563 			break;
564 		}
565 		/* user-level done queue might not be empty */
566 		if (aiop->aio_notifycnt > 0) {
567 			aiop->aio_notifycnt--;
568 			*rval = 1;
569 			break;
570 		}
571 		/* don't block if no outstanding aio */
572 		if (aiop->aio_outstanding == 0 && dontblockflg) {
573 			error = EINVAL;
574 			break;
575 		}
576 		if (blocking) {
577 			status = cv_waituntil_sig(&aiop->aio_waitcv,
578 			    &aiop->aio_mutex, rqtp, timecheck);
579 
580 			if (status > 0)		/* check done queue again */
581 				continue;
582 			if (status == 0) {	/* interrupted by a signal */
583 				error = EINTR;
584 				*rval = -1;
585 			} else {		/* timer expired */
586 				error = ETIME;
587 			}
588 		}
589 		break;
590 	}
591 	mutex_exit(&aiop->aio_mutex);
592 	if (reqp) {
593 		aphysio_unlock(reqp);
594 		aio_copyout_result(reqp);
595 		mutex_enter(&aiop->aio_mutex);
596 		aio_req_free(aiop, reqp);
597 		mutex_exit(&aiop->aio_mutex);
598 	}
599 	return (error);
600 }
601 
602 /*
603  * aiowaitn can be used to reap completed asynchronous requests submitted with
604  * lio_listio, aio_read or aio_write.
605  * This function only reaps asynchronous raw I/Os.
606  */
607 
608 /*ARGSUSED*/
609 static int
610 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
611 {
612 	int 		error = 0;
613 	aio_t		*aiop;
614 	aio_req_t	*reqlist = NULL;
615 	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
616 	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
617 	size_t		iocbsz;			/* users iocb size */
618 	size_t		riocbsz;		/* returned iocb size */
619 	int		iocb_index = 0;
620 	model_t		model = get_udatamodel();
621 	int		blocking = 1;
622 	int		timecheck;
623 	timestruc_t	rqtime;
624 	timestruc_t	*rqtp;
625 
626 	aiop = curproc->p_aio;
627 	if (aiop == NULL)
628 		return (EINVAL);
629 
630 	if (aiop->aio_outstanding == 0)
631 		return (EAGAIN);
632 
633 	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
634 		return (EFAULT);
635 
636 	/* set *nwait to zero, if we must return prematurely */
637 	if (copyout(&cnt, nwait, sizeof (uint_t)))
638 		return (EFAULT);
639 
640 	if (waitcnt == 0) {
641 		blocking = 0;
642 		rqtp = NULL;
643 		waitcnt = nent;
644 	} else {
645 		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
646 		if (error)
647 			return (error);
648 	}
649 
650 	if (model == DATAMODEL_NATIVE)
651 		iocbsz = (sizeof (aiocb_t *) * nent);
652 #ifdef	_SYSCALL32_IMPL
653 	else
654 		iocbsz = (sizeof (caddr32_t) * nent);
655 #endif  /* _SYSCALL32_IMPL */
656 
657 	/*
658 	 * Only one aio_waitn call is allowed at a time.
659 	 * The active aio_waitn will collect all requests
660 	 * out of the "done" list and if necessary it will wait
661 	 * for some/all pending requests to fulfill the nwait
662 	 * parameter.
663 	 * A second or further aio_waitn calls will sleep here
664 	 * until the active aio_waitn finishes and leaves the kernel
665 	 * If the second call does not block (poll), then return
666 	 * immediately with the error code : EAGAIN.
667 	 * If the second call should block, then sleep here, but
668 	 * do not touch the timeout. The timeout starts when this
669 	 * aio_waitn-call becomes active.
670 	 */
671 
672 	mutex_enter(&aiop->aio_mutex);
673 
674 	while (aiop->aio_flags & AIO_WAITN) {
675 		if (blocking == 0) {
676 			mutex_exit(&aiop->aio_mutex);
677 			return (EAGAIN);
678 		}
679 
680 		/* block, no timeout */
681 		aiop->aio_flags |= AIO_WAITN_PENDING;
682 		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
683 			mutex_exit(&aiop->aio_mutex);
684 			return (EINTR);
685 		}
686 	}
687 
688 	/*
689 	 * Establish the absolute future time for the timeout.
690 	 */
691 	if (rqtp) {
692 		timestruc_t now;
693 		timecheck = timechanged;
694 		gethrestime(&now);
695 		timespecadd(rqtp, &now);
696 	}
697 
698 	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
699 		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
700 		aiop->aio_iocb = NULL;
701 	}
702 
703 	if (aiop->aio_iocb == NULL) {
704 		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
705 		if (iocblist == NULL) {
706 			mutex_exit(&aiop->aio_mutex);
707 			return (ENOMEM);
708 		}
709 		aiop->aio_iocb = (aiocb_t **)iocblist;
710 		aiop->aio_iocbsz = iocbsz;
711 	} else {
712 		iocblist = (char *)aiop->aio_iocb;
713 	}
714 
715 	aiop->aio_waitncnt = waitcnt;
716 	aiop->aio_flags |= AIO_WAITN;
717 
718 	for (;;) {
719 		/* push requests on poll queue to done queue */
720 		if (aiop->aio_pollq) {
721 			mutex_exit(&aiop->aio_mutex);
722 			aio_cleanup(0);
723 			mutex_enter(&aiop->aio_mutex);
724 		}
725 
726 		/* check for requests on done queue */
727 		if (aiop->aio_doneq) {
728 			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
729 			aiop->aio_waitncnt = waitcnt - cnt;
730 		}
731 
732 		/* user-level done queue might not be empty */
733 		if (aiop->aio_notifycnt > 0) {
734 			aiop->aio_notifycnt--;
735 			error = 0;
736 			break;
737 		}
738 
739 		/*
740 		 * if we are here second time as a result of timer
741 		 * expiration, we reset error if there are enough
742 		 * aiocb's to satisfy request.
743 		 * We return also if all requests are already done
744 		 * and we picked up the whole done queue.
745 		 */
746 
747 		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
748 		    aiop->aio_doneq == NULL)) {
749 			error = 0;
750 			break;
751 		}
752 
753 		if ((cnt < waitcnt) && blocking) {
754 			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
755 				&aiop->aio_mutex, rqtp, timecheck);
756 			if (rval > 0)
757 				continue;
758 			if (rval < 0) {
759 				error = ETIME;
760 				blocking = 0;
761 				continue;
762 			}
763 			error = EINTR;
764 		}
765 		break;
766 	}
767 
768 	mutex_exit(&aiop->aio_mutex);
769 
770 	if (cnt > 0) {
771 
772 		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
773 		    aiop, model);
774 
775 		if (model == DATAMODEL_NATIVE)
776 			riocbsz = (sizeof (aiocb_t *) * cnt);
777 #ifdef	_SYSCALL32_IMPL
778 		else
779 			riocbsz = (sizeof (caddr32_t) * cnt);
780 #endif  /* _SYSCALL32_IMPL */
781 
782 		if (copyout(iocblist, uiocb, riocbsz) ||
783 		    copyout(&cnt, nwait, sizeof (uint_t)))
784 			error = EFAULT;
785 	}
786 
787 	if (aiop->aio_iocbsz > AIO_IOCB_MAX) {
788 		kmem_free(iocblist, aiop->aio_iocbsz);
789 		aiop->aio_iocb = NULL;
790 	}
791 
792 	/* check if there is another thread waiting for execution */
793 	mutex_enter(&aiop->aio_mutex);
794 	aiop->aio_flags &= ~AIO_WAITN;
795 	if (aiop->aio_flags & AIO_WAITN_PENDING) {
796 		aiop->aio_flags &= ~AIO_WAITN_PENDING;
797 		cv_signal(&aiop->aio_waitncv);
798 	}
799 	mutex_exit(&aiop->aio_mutex);
800 
801 	return (error);
802 }
803 
804 /*
805  * aio_unlock_requests
806  * copyouts the result of the request as well as the return value.
807  * It builds the list of completed asynchronous requests,
808  * unlocks the allocated memory ranges and
809  * put the aio request structure back into the free list.
810  */
811 
812 static int
813 aio_unlock_requests(
814 	caddr_t	iocblist,
815 	int	iocb_index,
816 	aio_req_t *reqlist,
817 	aio_t	*aiop,
818 	model_t	model)
819 {
820 	aio_req_t	*reqp, *nreqp;
821 
822 	if (model == DATAMODEL_NATIVE) {
823 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
824 			(((caddr_t *)iocblist)[iocb_index++]) =
825 			    reqp->aio_req_iocb.iocb;
826 			nreqp = reqp->aio_req_next;
827 			aphysio_unlock(reqp);
828 			aio_copyout_result(reqp);
829 			mutex_enter(&aiop->aio_mutex);
830 			aio_req_free(aiop, reqp);
831 			mutex_exit(&aiop->aio_mutex);
832 		}
833 	}
834 #ifdef	_SYSCALL32_IMPL
835 	else {
836 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
837 			((caddr32_t *)iocblist)[iocb_index++] =
838 			    reqp->aio_req_iocb.iocb32;
839 			nreqp = reqp->aio_req_next;
840 			aphysio_unlock(reqp);
841 			aio_copyout_result(reqp);
842 			mutex_enter(&aiop->aio_mutex);
843 			aio_req_free(aiop, reqp);
844 			mutex_exit(&aiop->aio_mutex);
845 		}
846 	}
847 #endif	/* _SYSCALL32_IMPL */
848 	return (iocb_index);
849 }
850 
851 /*
852  * aio_reqlist_concat
853  * moves "max" elements from the done queue to the reqlist queue and removes
854  * the AIO_DONEQ flag.
855  * - reqlist queue is a simple linked list
856  * - done queue is a double linked list
857  */
858 
859 static int
860 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
861 {
862 	aio_req_t *q2, *q2work, *list;
863 	int count = 0;
864 
865 	list = *reqlist;
866 	q2 = aiop->aio_doneq;
867 	q2work = q2;
868 	while (max-- > 0) {
869 		q2work->aio_req_flags &= ~AIO_DONEQ;
870 		q2work = q2work->aio_req_next;
871 		count++;
872 		if (q2work == q2)
873 			break;
874 	}
875 
876 	if (q2work == q2) {
877 		/* all elements revised */
878 		q2->aio_req_prev->aio_req_next = list;
879 		list = q2;
880 		aiop->aio_doneq = NULL;
881 	} else {
882 		/*
883 		 * max < elements in the doneq
884 		 * detach only the required amount of elements
885 		 * out of the doneq
886 		 */
887 		q2work->aio_req_prev->aio_req_next = list;
888 		list = q2;
889 
890 		aiop->aio_doneq = q2work;
891 		q2work->aio_req_prev = q2->aio_req_prev;
892 		q2->aio_req_prev->aio_req_next = q2work;
893 	}
894 	*reqlist = list;
895 	return (count);
896 }
897 
898 /*ARGSUSED*/
899 static int
900 aiosuspend(
901 	void	*aiocb,
902 	int	nent,
903 	struct	timespec	*timout,
904 	int	flag,
905 	long	*rval,
906 	int	run_mode)
907 {
908 	int 		error;
909 	aio_t		*aiop;
910 	aio_req_t	*reqp, *found, *next;
911 	caddr_t		cbplist = NULL;
912 	aiocb_t		*cbp, **ucbp;
913 #ifdef	_SYSCALL32_IMPL
914 	aiocb32_t	*cbp32;
915 	caddr32_t	*ucbp32;
916 #endif  /* _SYSCALL32_IMPL */
917 	aiocb64_32_t	*cbp64;
918 	int		rv;
919 	int		i;
920 	size_t		ssize;
921 	model_t		model = get_udatamodel();
922 	int		blocking;
923 	int		timecheck;
924 	timestruc_t	rqtime;
925 	timestruc_t	*rqtp;
926 
927 	aiop = curproc->p_aio;
928 	if (aiop == NULL || nent <= 0)
929 		return (EINVAL);
930 
931 	/*
932 	 * Establish the absolute future time for the timeout.
933 	 */
934 	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
935 	if (error)
936 		return (error);
937 	if (rqtp) {
938 		timestruc_t now;
939 		timecheck = timechanged;
940 		gethrestime(&now);
941 		timespecadd(rqtp, &now);
942 	}
943 
944 	/*
945 	 * If we are not blocking and there's no IO complete
946 	 * skip aiocb copyin.
947 	 */
948 	if (!blocking && (aiop->aio_pollq == NULL) &&
949 	    (aiop->aio_doneq == NULL)) {
950 		return (EAGAIN);
951 	}
952 
953 	if (model == DATAMODEL_NATIVE)
954 		ssize = (sizeof (aiocb_t *) * nent);
955 #ifdef	_SYSCALL32_IMPL
956 	else
957 		ssize = (sizeof (caddr32_t) * nent);
958 #endif  /* _SYSCALL32_IMPL */
959 
960 	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
961 	if (cbplist == NULL)
962 		return (ENOMEM);
963 
964 	if (copyin(aiocb, cbplist, ssize)) {
965 		error = EFAULT;
966 		goto done;
967 	}
968 
969 	found = NULL;
970 	/*
971 	 * we need to get the aio_cleanupq_mutex since we call
972 	 * aio_req_done().
973 	 */
974 	mutex_enter(&aiop->aio_cleanupq_mutex);
975 	mutex_enter(&aiop->aio_mutex);
976 	for (;;) {
977 		/* push requests on poll queue to done queue */
978 		if (aiop->aio_pollq) {
979 			mutex_exit(&aiop->aio_mutex);
980 			mutex_exit(&aiop->aio_cleanupq_mutex);
981 			aio_cleanup(0);
982 			mutex_enter(&aiop->aio_cleanupq_mutex);
983 			mutex_enter(&aiop->aio_mutex);
984 		}
985 		/* check for requests on done queue */
986 		if (aiop->aio_doneq) {
987 			if (model == DATAMODEL_NATIVE)
988 				ucbp = (aiocb_t **)cbplist;
989 #ifdef	_SYSCALL32_IMPL
990 			else
991 				ucbp32 = (caddr32_t *)cbplist;
992 #endif  /* _SYSCALL32_IMPL */
993 			for (i = 0; i < nent; i++) {
994 				if (model == DATAMODEL_NATIVE) {
995 					if ((cbp = *ucbp++) == NULL)
996 						continue;
997 					if (run_mode != AIO_LARGEFILE)
998 						reqp = aio_req_done(
999 						    &cbp->aio_resultp);
1000 					else {
1001 						cbp64 = (aiocb64_32_t *)cbp;
1002 						reqp = aio_req_done(
1003 						    &cbp64->aio_resultp);
1004 					}
1005 				}
1006 #ifdef	_SYSCALL32_IMPL
1007 				else {
1008 					if (run_mode == AIO_32) {
1009 						if ((cbp32 =
1010 						    (aiocb32_t *)(uintptr_t)
1011 						    *ucbp32++) == NULL)
1012 							continue;
1013 						reqp = aio_req_done(
1014 						    &cbp32->aio_resultp);
1015 					} else if (run_mode == AIO_LARGEFILE) {
1016 						if ((cbp64 =
1017 						    (aiocb64_32_t *)(uintptr_t)
1018 						    *ucbp32++) == NULL)
1019 							continue;
1020 						    reqp = aio_req_done(
1021 							&cbp64->aio_resultp);
1022 					}
1023 
1024 				}
1025 #endif  /* _SYSCALL32_IMPL */
1026 				if (reqp) {
1027 					reqp->aio_req_next = found;
1028 					found = reqp;
1029 				}
1030 				if (aiop->aio_doneq == NULL)
1031 					break;
1032 			}
1033 			if (found)
1034 				break;
1035 		}
1036 		if (aiop->aio_notifycnt > 0) {
1037 			/*
1038 			 * nothing on the kernel's queue. the user
1039 			 * has notified the kernel that it has items
1040 			 * on a user-level queue.
1041 			 */
1042 			aiop->aio_notifycnt--;
1043 			*rval = 1;
1044 			error = 0;
1045 			break;
1046 		}
1047 		/* don't block if nothing is outstanding */
1048 		if (aiop->aio_outstanding == 0) {
1049 			error = EAGAIN;
1050 			break;
1051 		}
1052 		if (blocking) {
1053 			/*
1054 			 * drop the aio_cleanupq_mutex as we are
1055 			 * going to block.
1056 			 */
1057 			mutex_exit(&aiop->aio_cleanupq_mutex);
1058 			rv = cv_waituntil_sig(&aiop->aio_waitcv,
1059 				&aiop->aio_mutex, rqtp, timecheck);
1060 			/*
1061 			 * we have to drop aio_mutex and
1062 			 * grab it in the right order.
1063 			 */
1064 			mutex_exit(&aiop->aio_mutex);
1065 			mutex_enter(&aiop->aio_cleanupq_mutex);
1066 			mutex_enter(&aiop->aio_mutex);
1067 			if (rv > 0)	/* check done queue again */
1068 				continue;
1069 			if (rv == 0)	/* interrupted by a signal */
1070 				error = EINTR;
1071 			else		/* timer expired */
1072 				error = ETIME;
1073 		} else {
1074 			error = EAGAIN;
1075 		}
1076 		break;
1077 	}
1078 	mutex_exit(&aiop->aio_mutex);
1079 	mutex_exit(&aiop->aio_cleanupq_mutex);
1080 	for (reqp = found; reqp != NULL; reqp = next) {
1081 		next = reqp->aio_req_next;
1082 		aphysio_unlock(reqp);
1083 		aio_copyout_result(reqp);
1084 		mutex_enter(&aiop->aio_mutex);
1085 		aio_req_free(aiop, reqp);
1086 		mutex_exit(&aiop->aio_mutex);
1087 	}
1088 done:
1089 	kmem_free(cbplist, ssize);
1090 	return (error);
1091 }
1092 
1093 /*
1094  * initialize aio by allocating an aio_t struct for this
1095  * process.
1096  */
1097 static int
1098 aioinit(void)
1099 {
1100 	proc_t *p = curproc;
1101 	aio_t *aiop;
1102 	mutex_enter(&p->p_lock);
1103 	if ((aiop = p->p_aio) == NULL) {
1104 		aiop = aio_aiop_alloc();
1105 		p->p_aio = aiop;
1106 	}
1107 	mutex_exit(&p->p_lock);
1108 	if (aiop == NULL)
1109 		return (ENOMEM);
1110 	return (0);
1111 }
1112 
1113 /*
1114  * start a special thread that will cleanup after aio requests
1115  * that are preventing a segment from being unmapped. as_unmap()
 * blocks until all physio to this segment is completed. this
1117  * doesn't happen until all the pages in this segment are not
1118  * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
1119  * requests still outstanding. this special thread will make sure
1120  * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
1121  *
1122  * this function will return an error if the process has only
1123  * one LWP. the assumption is that the caller is a separate LWP
1124  * that remains blocked in the kernel for the life of this process.
1125  */
1126 static int
1127 aiostart(void)
1128 {
1129 	proc_t *p = curproc;
1130 	aio_t *aiop;
1131 	int first, error = 0;
1132 
1133 	if (p->p_lwpcnt == 1)
1134 		return (EDEADLK);
1135 	mutex_enter(&p->p_lock);
1136 	if ((aiop = p->p_aio) == NULL)
1137 		error = EINVAL;
1138 	else {
1139 		first = aiop->aio_ok;
1140 		if (aiop->aio_ok == 0)
1141 			aiop->aio_ok = 1;
1142 	}
1143 	mutex_exit(&p->p_lock);
1144 	if (error == 0 && first == 0) {
1145 		return (aio_cleanup_thread(aiop));
1146 		/* should return only to exit */
1147 	}
1148 	return (error);
1149 }
1150 
1151 /*
1152  * Associate an aiocb with a port.
1153  * This function is used by aiorw() to associate a transaction with a port.
1154  * Allocate an event port structure (port_alloc_event()) and store the
1155  * delivered user pointer (portnfy_user) in the portkev_user field of the
 * port_kevent_t structure.
1157  * The aio_req_portkev pointer in the aio_req_t structure was added to identify
1158  * the port association.
1159  */
1160 
1161 static int
1162 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp, aio_req_t *reqp)
1163 {
1164 	port_kevent_t	*pkevp = NULL;
1165 	int		error;
1166 
1167 	error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
1168 	    PORT_SOURCE_AIO, &pkevp);
1169 	if (error) {
1170 		if ((error == ENOMEM) || (error == EAGAIN))
1171 			error = EAGAIN;
1172 		else
1173 			error = EINVAL;
1174 	} else {
1175 		port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
1176 		    aio_port_callback, reqp);
1177 		reqp->aio_req_portkev = pkevp;
1178 		reqp->aio_req_port = pntfy->portnfy_port;
1179 	}
1180 	return (error);
1181 }
1182 
1183 /*
1184  * Associate an aiocb with a port.
1185  * This function is used by lio_listio() to associate a transaction with a port.
1186  * Allocate an event port structure (port_alloc_event()) and store the
1187  * delivered user pointer (portnfy_user) in the portkev_user field of the
1188  * The aio_req_portkev pointer in the aio_req_t structure was added to identify
1189  * the port association.
1190  * The event port notification can be requested attaching the port_notify_t
1191  * structure to the sigevent argument of lio_listio() or attaching the
1192  * port_notify_t structure to the sigevent structure which is embedded in the
1193  * aiocb.
1194  * The attachement to the global sigevent structure is valid for all aiocbs
1195  * in the list.
1196  */
1197 
1198 static int
1199 aio_req_assoc_port(struct sigevent *sigev, void	*user, aiocb_t *cbp,
1200     aio_req_t *reqp, port_kevent_t *pkevtp)
1201 {
1202 	port_kevent_t	*pkevp = NULL;
1203 	port_notify_t	pntfy;
1204 	int		error;
1205 
1206 	if (sigev->sigev_notify == SIGEV_PORT) {
1207 		/* aiocb has an own port notification embedded */
1208 		if (copyin((void *)sigev->sigev_value.sival_ptr, &pntfy,
1209 		    sizeof (port_notify_t)))
1210 			return (EFAULT);
1211 
1212 		error = port_alloc_event(pntfy.portnfy_port, PORT_ALLOC_DEFAULT,
1213 		    PORT_SOURCE_AIO, &pkevp);
1214 		if (error) {
1215 			if ((error == ENOMEM) || (error == EAGAIN))
1216 				return (EAGAIN);
1217 			else
1218 				return (EINVAL);
1219 		}
1220 		/* use this values instead of the global values in port */
1221 
1222 		port_init_event(pkevp, (uintptr_t)cbp, pntfy.portnfy_user,
1223 		    aio_port_callback, reqp);
1224 		reqp->aio_req_port = pntfy.portnfy_port;
1225 	} else {
1226 		/* use global port notification */
1227 		error = port_dup_event(pkevtp, &pkevp, PORT_ALLOC_DEFAULT);
1228 		if (error)
1229 			return (EAGAIN);
1230 		port_init_event(pkevp, (uintptr_t)cbp, user, aio_port_callback,
1231 		    reqp);
1232 	}
1233 	reqp->aio_req_portkev = pkevp;
1234 	return (0);
1235 }
1236 
1237 /*
1238  * Same comments as in aio_req_assoc_port(), see above.
1239  */
1240 
1241 static int
1242 aio_req_assoc_port32(struct sigevent32 *sigev, void *user, aiocb_t *cbp,
1243     aio_req_t *reqp, port_kevent_t *pkevtp)
1244 {
1245 	port_kevent_t	*pkevp = NULL;
1246 	port_notify32_t	pntfy;
1247 	int		error;
1248 
1249 	if (sigev->sigev_notify == SIGEV_PORT) {
1250 		if (copyin((void *)(uintptr_t)sigev->sigev_value.sival_int,
1251 		    &pntfy, sizeof (port_notify32_t)))
1252 			return (EFAULT);
1253 
1254 		error = port_alloc_event(pntfy.portnfy_port,
1255 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevp);
1256 		if (error) {
1257 			if ((error == ENOMEM) || (error == EAGAIN))
1258 				return (EAGAIN);
1259 			else
1260 				return (EINVAL);
1261 		}
1262 		/* use this values instead of the global values in port */
1263 
1264 		port_init_event(pkevp, (uintptr_t)cbp,
1265 		    (void *)(uintptr_t)pntfy.portnfy_user,
1266 		    aio_port_callback, reqp);
1267 		reqp->aio_req_port = pntfy.portnfy_port;
1268 	} else {
1269 		error = port_dup_event(pkevtp, &pkevp, PORT_ALLOC_DEFAULT);
1270 		if (error)
1271 			return (EAGAIN);
1272 		port_init_event(pkevp, (uintptr_t)cbp, user, aio_port_callback,
1273 		    reqp);
1274 	}
1275 	reqp->aio_req_portkev = pkevp;
1276 	return (0);
1277 }
1278 
1279 
1280 #ifdef _LP64
1281 
1282 /*
1283  * Asynchronous list IO. A chain of aiocb's are copied in
1284  * one at a time. If the aiocb is invalid, it is skipped.
1285  * For each aiocb, the appropriate driver entry point is
1286  * called. Optimize for the common case where the list
1287  * of requests is to the same file descriptor.
1288  *
1289  * One possible optimization is to define a new driver entry
1290  * point that supports a list of IO requests. Whether this
1291  * improves performance depends somewhat on the driver's
1292  * locking strategy. Processing a list could adversely impact
1293  * the driver's interrupt latency.
1294  */
1295 /*ARGSUSED*/
1296 static int
1297 alio(
1298 	int	opcode,
1299 	int	mode_arg,
1300 	aiocb_t	**aiocb_arg,
1301 	int	nent,
1302 	struct	sigevent *sigev)
1303 
1304 {
1305 	file_t		*fp;
1306 	file_t		*prev_fp = NULL;
1307 	int		prev_mode = -1;
1308 	struct vnode	*vp;
1309 	aio_lio_t	*head;
1310 	aio_req_t	*reqp;
1311 	aio_t		*aiop;
1312 	caddr_t		cbplist;
1313 	aiocb_t		*cbp, **ucbp;
1314 	aiocb_t		cb;
1315 	aiocb_t		*aiocb = &cb;
1316 	struct sigevent sigevk;
1317 	sigqueue_t	*sqp;
1318 	int		(*aio_func)();
1319 	int		mode;
1320 	int		error = 0;
1321 	int		aio_errors = 0;
1322 	int		i;
1323 	size_t		ssize;
1324 	int		deadhead = 0;
1325 	int		aio_notsupported = 0;
1326 	int		aio_use_port = 0;
1327 	port_kevent_t	*pkevtp = NULL;
1328 	port_notify_t	pnotify;
1329 
1330 	aiop = curproc->p_aio;
1331 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1332 		return (EINVAL);
1333 
1334 	ssize = (sizeof (aiocb_t *) * nent);
1335 	cbplist = kmem_alloc(ssize, KM_SLEEP);
1336 	ucbp = (aiocb_t **)cbplist;
1337 
1338 	if (copyin(aiocb_arg, cbplist, sizeof (aiocb_t *) * nent)) {
1339 		kmem_free(cbplist, ssize);
1340 		return (EFAULT);
1341 	}
1342 
1343 	if (sigev) {
1344 		if (copyin(sigev, &sigevk, sizeof (struct sigevent))) {
1345 			kmem_free(cbplist, ssize);
1346 			return (EFAULT);
1347 		}
1348 	}
1349 
1350 	/*
1351 	 * a list head should be allocated if notification is
1352 	 * enabled for this list.
1353 	 */
1354 	head = NULL;
1355 
1356 	/* Event Ports  */
1357 
1358 	if (sigev && sigevk.sigev_notify == SIGEV_PORT) {
1359 		/* Use port for completion notification */
1360 		if (copyin(sigevk.sigev_value.sival_ptr, &pnotify,
1361 		    sizeof (port_notify_t))) {
1362 			kmem_free(cbplist, ssize);
1363 			return (EFAULT);
1364 		}
1365 		/* use event ports for the list of aiocbs */
1366 		aio_use_port = 1;
1367 		error = port_alloc_event(pnotify.portnfy_port,
1368 		    PORT_ALLOC_PRIVATE, PORT_SOURCE_AIO, &pkevtp);
1369 		if (error) {
1370 			if ((error == ENOMEM) || (error == EAGAIN))
1371 				error = EAGAIN;
1372 			else
1373 				error = EINVAL;
1374 			kmem_free(cbplist, ssize);
1375 			return (error);
1376 		}
1377 	} else if ((mode_arg == LIO_WAIT) || sigev) {
1378 		mutex_enter(&aiop->aio_mutex);
1379 		error = aio_lio_alloc(&head);
1380 		mutex_exit(&aiop->aio_mutex);
1381 		if (error)
1382 			goto done;
1383 		deadhead = 1;
1384 		head->lio_nent = nent;
1385 		head->lio_refcnt = nent;
1386 		if (sigev && (sigevk.sigev_notify == SIGEV_SIGNAL) &&
1387 		    (sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG)) {
1388 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
1389 			if (sqp == NULL) {
1390 				error = EAGAIN;
1391 				goto done;
1392 			}
1393 			sqp->sq_func = NULL;
1394 			sqp->sq_next = NULL;
1395 			sqp->sq_info.si_code = SI_ASYNCIO;
1396 			sqp->sq_info.si_pid = curproc->p_pid;
1397 			sqp->sq_info.si_ctid = PRCTID(curproc);
1398 			sqp->sq_info.si_zoneid = getzoneid();
1399 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
1400 			sqp->sq_info.si_signo = sigevk.sigev_signo;
1401 			sqp->sq_info.si_value = sigevk.sigev_value;
1402 			head->lio_sigqp = sqp;
1403 		} else {
1404 			head->lio_sigqp = NULL;
1405 		}
1406 	}
1407 
1408 	for (i = 0; i < nent; i++, ucbp++) {
1409 
1410 		cbp = *ucbp;
1411 		/* skip entry if it can't be copied. */
1412 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (aiocb_t))) {
1413 			if (head) {
1414 				mutex_enter(&aiop->aio_mutex);
1415 				head->lio_nent--;
1416 				head->lio_refcnt--;
1417 				mutex_exit(&aiop->aio_mutex);
1418 			}
1419 			continue;
1420 		}
1421 
1422 		/* skip if opcode for aiocb is LIO_NOP */
1423 
1424 		mode = aiocb->aio_lio_opcode;
1425 		if (mode == LIO_NOP) {
1426 			cbp = NULL;
1427 			if (head) {
1428 				mutex_enter(&aiop->aio_mutex);
1429 				head->lio_nent--;
1430 				head->lio_refcnt--;
1431 				mutex_exit(&aiop->aio_mutex);
1432 			}
1433 			continue;
1434 		}
1435 
1436 		/* increment file descriptor's ref count. */
1437 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
1438 			lio_set_uerror(&cbp->aio_resultp, EBADF);
1439 			if (head) {
1440 				mutex_enter(&aiop->aio_mutex);
1441 				head->lio_nent--;
1442 				head->lio_refcnt--;
1443 				mutex_exit(&aiop->aio_mutex);
1444 			}
1445 			aio_errors++;
1446 			continue;
1447 		}
1448 
1449 		vp = fp->f_vnode;
1450 
1451 		/*
1452 		 * check the permission of the partition
1453 		 */
1454 		mode = aiocb->aio_lio_opcode;
1455 		if ((fp->f_flag & mode) == 0) {
1456 			releasef(aiocb->aio_fildes);
1457 			lio_set_uerror(&cbp->aio_resultp, EBADF);
1458 			if (head) {
1459 				mutex_enter(&aiop->aio_mutex);
1460 				head->lio_nent--;
1461 				head->lio_refcnt--;
1462 				mutex_exit(&aiop->aio_mutex);
1463 			}
1464 			aio_errors++;
1465 			continue;
1466 		}
1467 
1468 		/*
1469 		 * common case where requests are to the same fd for the
1470 		 * same r/w operation.
1471 		 * for UFS, need to set EBADFD
1472 		 */
1473 		if ((fp != prev_fp) || (mode != prev_mode)) {
1474 			aio_func = check_vp(vp, mode);
1475 			if (aio_func == NULL) {
1476 				prev_fp = NULL;
1477 				releasef(aiocb->aio_fildes);
1478 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
1479 				aio_notsupported++;
1480 				if (head) {
1481 					mutex_enter(&aiop->aio_mutex);
1482 					head->lio_nent--;
1483 					head->lio_refcnt--;
1484 					mutex_exit(&aiop->aio_mutex);
1485 				}
1486 				continue;
1487 			} else {
1488 				prev_fp = fp;
1489 				prev_mode = mode;
1490 			}
1491 		}
1492 
1493 		if (error = aio_req_setup(&reqp, aiop, aiocb,
1494 		    &cbp->aio_resultp, aio_use_port, vp)) {
1495 			releasef(aiocb->aio_fildes);
1496 			lio_set_uerror(&cbp->aio_resultp, error);
1497 			if (head) {
1498 				mutex_enter(&aiop->aio_mutex);
1499 				head->lio_nent--;
1500 				head->lio_refcnt--;
1501 				mutex_exit(&aiop->aio_mutex);
1502 			}
1503 			aio_errors++;
1504 			continue;
1505 		}
1506 
1507 		reqp->aio_req_lio = head;
1508 		deadhead = 0;
1509 
1510 		/*
1511 		 * Set the errno field now before sending the request to
1512 		 * the driver to avoid a race condition
1513 		 */
1514 		(void) suword32(&cbp->aio_resultp.aio_errno,
1515 		    EINPROGRESS);
1516 
1517 		reqp->aio_req_iocb.iocb = (caddr_t)cbp;
1518 
1519 		if (aio_use_port) {
1520 			reqp->aio_req_port = pnotify.portnfy_port;
1521 			error = aio_req_assoc_port(&aiocb->aio_sigevent,
1522 			    pnotify.portnfy_user, cbp, reqp, pkevtp);
1523 		}
1524 
1525 		/*
1526 		 * send the request to driver.
1527 		 * Clustering: If PXFS vnode, call PXFS function.
1528 		 */
1529 		if (error == 0) {
1530 			if (aiocb->aio_nbytes == 0) {
1531 				clear_active_fd(aiocb->aio_fildes);
1532 				aio_zerolen(reqp);
1533 				continue;
1534 			}
1535 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
1536 			    CRED());
1537 		}
1538 		/*
1539 		 * the fd's ref count is not decremented until the IO has
1540 		 * completed unless there was an error.
1541 		 */
1542 		if (error) {
1543 			releasef(aiocb->aio_fildes);
1544 			lio_set_uerror(&cbp->aio_resultp, error);
1545 			if (head) {
1546 				mutex_enter(&aiop->aio_mutex);
1547 				head->lio_nent--;
1548 				head->lio_refcnt--;
1549 				mutex_exit(&aiop->aio_mutex);
1550 			}
1551 			if (error == ENOTSUP)
1552 				aio_notsupported++;
1553 			else
1554 				aio_errors++;
1555 			lio_set_error(reqp);
1556 		} else {
1557 			clear_active_fd(aiocb->aio_fildes);
1558 		}
1559 	}
1560 
1561 	if (pkevtp)
1562 		port_free_event(pkevtp);
1563 
1564 	if (aio_notsupported) {
1565 		error = ENOTSUP;
1566 	} else if (aio_errors) {
1567 		/*
1568 		 * return EIO if any request failed
1569 		 */
1570 		error = EIO;
1571 	}
1572 
1573 	if (mode_arg == LIO_WAIT) {
1574 		mutex_enter(&aiop->aio_mutex);
1575 		while (head->lio_refcnt > 0) {
1576 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1577 				mutex_exit(&aiop->aio_mutex);
1578 				error = EINTR;
1579 				goto done;
1580 			}
1581 		}
1582 		mutex_exit(&aiop->aio_mutex);
1583 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
1584 	}
1585 
1586 done:
1587 	kmem_free(cbplist, ssize);
1588 	if (deadhead) {
1589 		if (head->lio_sigqp)
1590 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
1591 		kmem_free(head, sizeof (aio_lio_t));
1592 	}
1593 	return (error);
1594 }
1595 
1596 #endif /* _LP64 */
1597 
1598 /*
1599  * Asynchronous list IO.
1600  * If list I/O is called with LIO_WAIT it can still return
1601  * before all the I/O's are completed if a signal is caught
1602  * or if the list include UFS I/O requests. If this happens,
1603  * libaio will call aliowait() to wait for the I/O's to
1604  * complete
1605  */
1606 /*ARGSUSED*/
1607 static int
1608 aliowait(
1609 	int	mode,
1610 	void	*aiocb,
1611 	int	nent,
1612 	void	*sigev,
1613 	int	run_mode)
1614 {
1615 	aio_lio_t	*head;
1616 	aio_t		*aiop;
1617 	caddr_t		cbplist;
1618 	aiocb_t		*cbp, **ucbp;
1619 #ifdef	_SYSCALL32_IMPL
1620 	aiocb32_t	*cbp32;
1621 	caddr32_t	*ucbp32;
1622 	aiocb64_32_t	*cbp64;
1623 #endif
1624 	int		error = 0;
1625 	int		i;
1626 	size_t		ssize = 0;
1627 	model_t		model = get_udatamodel();
1628 
1629 	aiop = curproc->p_aio;
1630 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1631 		return (EINVAL);
1632 
1633 	if (model == DATAMODEL_NATIVE)
1634 		ssize = (sizeof (aiocb_t *) * nent);
1635 #ifdef	_SYSCALL32_IMPL
1636 	else
1637 		ssize = (sizeof (caddr32_t) * nent);
1638 #endif  /* _SYSCALL32_IMPL */
1639 
1640 	if (ssize == 0)
1641 		return (EINVAL);
1642 
1643 	cbplist = kmem_alloc(ssize, KM_SLEEP);
1644 
1645 	if (model == DATAMODEL_NATIVE)
1646 		ucbp = (aiocb_t **)cbplist;
1647 #ifdef	_SYSCALL32_IMPL
1648 	else
1649 		ucbp32 = (caddr32_t *)cbplist;
1650 #endif  /* _SYSCALL32_IMPL */
1651 
1652 	if (copyin(aiocb, cbplist, ssize)) {
1653 		error = EFAULT;
1654 		goto done;
1655 	}
1656 
1657 	/*
1658 	 * To find the list head, we go through the
1659 	 * list of aiocb structs, find the request
1660 	 * its for, then get the list head that reqp
1661 	 * points to
1662 	 */
1663 	head = NULL;
1664 
1665 	for (i = 0; i < nent; i++) {
1666 		if (model == DATAMODEL_NATIVE) {
1667 			/*
1668 			 * Since we are only checking for a NULL pointer
1669 			 * Following should work on both native data sizes
1670 			 * as well as for largefile aiocb.
1671 			 */
1672 			if ((cbp = *ucbp++) == NULL)
1673 				continue;
1674 			if (run_mode != AIO_LARGEFILE)
1675 				if (head = aio_list_get(&cbp->aio_resultp))
1676 					break;
1677 			else {
1678 				/*
1679 				 * This is a case when largefile call is
1680 				 * made on 32 bit kernel.
1681 				 * Treat each pointer as pointer to
1682 				 * aiocb64_32
1683 				 */
1684 				if (head = aio_list_get((aio_result_t *)
1685 				    &(((aiocb64_32_t *)cbp)->aio_resultp)))
1686 					break;
1687 			}
1688 		}
1689 #ifdef	_SYSCALL32_IMPL
1690 		else {
1691 			if (run_mode == AIO_LARGEFILE) {
1692 				if ((cbp64 = (aiocb64_32_t *)
1693 				    (uintptr_t)*ucbp32++) == NULL)
1694 					continue;
1695 				if (head = aio_list_get((aio_result_t *)
1696 				    &cbp64->aio_resultp))
1697 					break;
1698 			} else if (run_mode == AIO_32) {
1699 				if ((cbp32 = (aiocb32_t *)
1700 				    (uintptr_t)*ucbp32++) == NULL)
1701 					continue;
1702 				if (head = aio_list_get((aio_result_t *)
1703 				    &cbp32->aio_resultp))
1704 					break;
1705 			}
1706 		}
1707 #endif	/* _SYSCALL32_IMPL */
1708 	}
1709 
1710 	if (head == NULL) {
1711 		error = EINVAL;
1712 		goto done;
1713 	}
1714 
1715 	mutex_enter(&aiop->aio_mutex);
1716 	while (head->lio_refcnt > 0) {
1717 		if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1718 			mutex_exit(&aiop->aio_mutex);
1719 			error = EINTR;
1720 			goto done;
1721 		}
1722 	}
1723 	mutex_exit(&aiop->aio_mutex);
1724 	alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
1725 done:
1726 	kmem_free(cbplist, ssize);
1727 	return (error);
1728 }
1729 
1730 aio_lio_t *
1731 aio_list_get(aio_result_t *resultp)
1732 {
1733 	aio_lio_t	*head = NULL;
1734 	aio_t		*aiop;
1735 	aio_req_t 	**bucket;
1736 	aio_req_t 	*reqp;
1737 	long		index;
1738 
1739 	aiop = curproc->p_aio;
1740 	if (aiop == NULL)
1741 		return (NULL);
1742 
1743 	if (resultp) {
1744 		index = AIO_HASH(resultp);
1745 		bucket = &aiop->aio_hash[index];
1746 		for (reqp = *bucket; reqp != NULL;
1747 		    reqp = reqp->aio_hash_next) {
1748 			if (reqp->aio_req_resultp == resultp) {
1749 				head = reqp->aio_req_lio;
1750 				return (head);
1751 			}
1752 		}
1753 	}
1754 	return (NULL);
1755 }
1756 
1757 
1758 static void
1759 lio_set_uerror(void *resultp, int error)
1760 {
1761 	/*
1762 	 * the resultp field is a pointer to where the
1763 	 * error should be written out to the user's
1764 	 * aiocb.
1765 	 *
1766 	 */
1767 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1768 		(void) sulword(&((aio_result_t *)resultp)->aio_return,
1769 		    (ssize_t)-1);
1770 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1771 	}
1772 #ifdef	_SYSCALL32_IMPL
1773 	else {
1774 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1775 		    (uint_t)-1);
1776 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1777 	}
1778 #endif  /* _SYSCALL32_IMPL */
1779 }
1780 
1781 /*
1782  * do cleanup completion for all requests in list. memory for
1783  * each request is also freed.
1784  */
1785 static void
1786 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
1787 {
1788 	int i;
1789 	aio_req_t *reqp;
1790 	aio_result_t *resultp;
1791 	aiocb64_32_t	*aiocb_64;
1792 
1793 	for (i = 0; i < nent; i++) {
1794 		if (get_udatamodel() == DATAMODEL_NATIVE) {
1795 			if (cbp[i] == NULL)
1796 				continue;
1797 			if (run_mode == AIO_LARGEFILE) {
1798 				aiocb_64 = (aiocb64_32_t *)cbp[i];
1799 				resultp = (aio_result_t *)&aiocb_64->
1800 				    aio_resultp;
1801 			} else
1802 				resultp = &cbp[i]->aio_resultp;
1803 		}
1804 #ifdef	_SYSCALL32_IMPL
1805 		else {
1806 			aiocb32_t	*aiocb_32;
1807 			caddr32_t	*cbp32;
1808 
1809 			cbp32 = (caddr32_t *)cbp;
1810 			if (cbp32[i] == NULL)
1811 				continue;
1812 			if (run_mode == AIO_32) {
1813 				aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
1814 				resultp = (aio_result_t *)&aiocb_32->
1815 				    aio_resultp;
1816 			} else if (run_mode == AIO_LARGEFILE) {
1817 				aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
1818 				resultp = (aio_result_t *)&aiocb_64->
1819 				    aio_resultp;
1820 			}
1821 		}
1822 #endif  /* _SYSCALL32_IMPL */
1823 		/*
1824 		 * we need to get the aio_cleanupq_mutex since we call
1825 		 * aio_req_done().
1826 		 */
1827 		mutex_enter(&aiop->aio_cleanupq_mutex);
1828 		mutex_enter(&aiop->aio_mutex);
1829 		reqp = aio_req_done(resultp);
1830 		mutex_exit(&aiop->aio_mutex);
1831 		mutex_exit(&aiop->aio_cleanupq_mutex);
1832 		if (reqp != NULL) {
1833 			aphysio_unlock(reqp);
1834 			aio_copyout_result(reqp);
1835 			mutex_enter(&aiop->aio_mutex);
1836 			aio_req_free(aiop, reqp);
1837 			mutex_exit(&aiop->aio_mutex);
1838 		}
1839 	}
1840 }
1841 
1842 /*
1843  * write out the results for an aio request that is
1844  * done.
1845  */
1846 static int
1847 aioerror(void *cb, int run_mode)
1848 {
1849 	aio_result_t *resultp;
1850 	aio_t *aiop;
1851 	aio_req_t *reqp;
1852 	int retval;
1853 
1854 	aiop = curproc->p_aio;
1855 	if (aiop == NULL || cb == NULL)
1856 		return (EINVAL);
1857 
1858 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1859 		if (run_mode == AIO_LARGEFILE)
1860 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1861 			    aio_resultp;
1862 		else
1863 			resultp = &((aiocb_t *)cb)->aio_resultp;
1864 	}
1865 #ifdef	_SYSCALL32_IMPL
1866 	else {
1867 		if (run_mode == AIO_LARGEFILE)
1868 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1869 			    aio_resultp;
1870 		else if (run_mode == AIO_32)
1871 			resultp = (aio_result_t *)&((aiocb32_t *)cb)->
1872 			    aio_resultp;
1873 	}
1874 #endif  /* _SYSCALL32_IMPL */
1875 	/*
1876 	 * we need to get the aio_cleanupq_mutex since we call
1877 	 * aio_req_find().
1878 	 */
1879 	mutex_enter(&aiop->aio_cleanupq_mutex);
1880 	mutex_enter(&aiop->aio_mutex);
1881 	retval = aio_req_find(resultp, &reqp);
1882 	mutex_exit(&aiop->aio_mutex);
1883 	mutex_exit(&aiop->aio_cleanupq_mutex);
1884 	if (retval == 0) {
1885 		aphysio_unlock(reqp);
1886 		aio_copyout_result(reqp);
1887 		mutex_enter(&aiop->aio_mutex);
1888 		aio_req_free(aiop, reqp);
1889 		mutex_exit(&aiop->aio_mutex);
1890 		return (0);
1891 	} else if (retval == 1)
1892 		return (EINPROGRESS);
1893 	else if (retval == 2)
1894 		return (EINVAL);
1895 	return (0);
1896 }
1897 
1898 /*
1899  * 	aio_cancel - if no requests outstanding,
1900  *			return AIO_ALLDONE
1901  *			else
1902  *			return AIO_NOTCANCELED
1903  */
1904 static int
1905 aio_cancel(
1906 	int	fildes,
1907 	void 	*cb,
1908 	long	*rval,
1909 	int	run_mode)
1910 {
1911 	aio_t *aiop;
1912 	void *resultp;
1913 	int index;
1914 	aio_req_t **bucket;
1915 	aio_req_t *ent;
1916 
1917 
1918 	/*
1919 	 * Verify valid file descriptor
1920 	 */
1921 	if ((getf(fildes)) == NULL) {
1922 		return (EBADF);
1923 	}
1924 	releasef(fildes);
1925 
1926 	aiop = curproc->p_aio;
1927 	if (aiop == NULL)
1928 		return (EINVAL);
1929 
1930 	if (aiop->aio_outstanding == 0) {
1931 		*rval = AIO_ALLDONE;
1932 		return (0);
1933 	}
1934 
1935 	mutex_enter(&aiop->aio_mutex);
1936 	if (cb != NULL) {
1937 		if (get_udatamodel() == DATAMODEL_NATIVE) {
1938 			if (run_mode == AIO_LARGEFILE)
1939 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1940 				    ->aio_resultp;
1941 			else
1942 				resultp = &((aiocb_t *)cb)->aio_resultp;
1943 		}
1944 #ifdef	_SYSCALL32_IMPL
1945 		else {
1946 			if (run_mode == AIO_LARGEFILE)
1947 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1948 				    ->aio_resultp;
1949 			else if (run_mode == AIO_32)
1950 				resultp = (aio_result_t *)&((aiocb32_t *)cb)
1951 				    ->aio_resultp;
1952 		}
1953 #endif  /* _SYSCALL32_IMPL */
1954 		index = AIO_HASH(resultp);
1955 		bucket = &aiop->aio_hash[index];
1956 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1957 			if (ent->aio_req_resultp == resultp) {
1958 				if ((ent->aio_req_flags & AIO_PENDING) == 0) {
1959 					mutex_exit(&aiop->aio_mutex);
1960 					*rval = AIO_ALLDONE;
1961 					return (0);
1962 				}
1963 				mutex_exit(&aiop->aio_mutex);
1964 				*rval = AIO_NOTCANCELED;
1965 				return (0);
1966 			}
1967 		}
1968 		mutex_exit(&aiop->aio_mutex);
1969 		*rval = AIO_ALLDONE;
1970 		return (0);
1971 	}
1972 
1973 	for (index = 0; index < AIO_HASHSZ; index++) {
1974 		bucket = &aiop->aio_hash[index];
1975 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1976 			if (ent->aio_req_fd == fildes) {
1977 				if ((ent->aio_req_flags & AIO_PENDING) != 0) {
1978 					mutex_exit(&aiop->aio_mutex);
1979 					*rval = AIO_NOTCANCELED;
1980 					return (0);
1981 				}
1982 			}
1983 		}
1984 	}
1985 	mutex_exit(&aiop->aio_mutex);
1986 	*rval = AIO_ALLDONE;
1987 	return (0);
1988 }
1989 
1990 /*
1991  * solaris version of asynchronous read and write
1992  */
1993 static int
1994 arw(
1995 	int	opcode,
1996 	int	fdes,
1997 	char	*bufp,
1998 	int	bufsize,
1999 	offset_t	offset,
2000 	aio_result_t	*resultp,
2001 	int		mode)
2002 {
2003 	file_t		*fp;
2004 	int		error;
2005 	struct vnode	*vp;
2006 	aio_req_t	*reqp;
2007 	aio_t		*aiop;
2008 	int		(*aio_func)();
2009 #ifdef _LP64
2010 	aiocb_t		aiocb;
2011 #else
2012 	aiocb64_32_t	aiocb64;
2013 #endif
2014 
2015 	aiop = curproc->p_aio;
2016 	if (aiop == NULL)
2017 		return (EINVAL);
2018 
2019 	if ((fp = getf(fdes)) == NULL) {
2020 		return (EBADF);
2021 	}
2022 
2023 	/*
2024 	 * check the permission of the partition
2025 	 */
2026 	if ((fp->f_flag & mode) == 0) {
2027 		releasef(fdes);
2028 		return (EBADF);
2029 	}
2030 
2031 	vp = fp->f_vnode;
2032 	aio_func = check_vp(vp, mode);
2033 	if (aio_func == NULL) {
2034 		releasef(fdes);
2035 		return (EBADFD);
2036 	}
2037 #ifdef _LP64
2038 	aiocb.aio_fildes = fdes;
2039 	aiocb.aio_buf = bufp;
2040 	aiocb.aio_nbytes = bufsize;
2041 	aiocb.aio_offset = offset;
2042 	aiocb.aio_sigevent.sigev_notify = 0;
2043 	error = aio_req_setup(&reqp, aiop, &aiocb, resultp, 0, vp);
2044 #else
2045 	aiocb64.aio_fildes = fdes;
2046 	aiocb64.aio_buf = (caddr32_t)bufp;
2047 	aiocb64.aio_nbytes = bufsize;
2048 	aiocb64.aio_offset = offset;
2049 	aiocb64.aio_sigevent.sigev_notify = 0;
2050 	error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, 0, vp);
2051 #endif
2052 	if (error) {
2053 		releasef(fdes);
2054 		return (error);
2055 	}
2056 
2057 	/*
2058 	 * enable polling on this request if the opcode has
2059 	 * the AIO poll bit set
2060 	 */
2061 	if (opcode & AIO_POLL_BIT)
2062 		reqp->aio_req_flags |= AIO_POLL;
2063 
2064 	if (bufsize == 0) {
2065 		clear_active_fd(fdes);
2066 		aio_zerolen(reqp);
2067 		return (0);
2068 	}
2069 	/*
2070 	 * send the request to driver.
2071 	 * Clustering: If PXFS vnode, call PXFS function.
2072 	 */
2073 	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2074 	/*
2075 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2076 	 * is released by the aio_cleanup_thread() when the IO has
2077 	 * completed.
2078 	 */
2079 	if (error) {
2080 		releasef(fdes);
2081 		mutex_enter(&aiop->aio_mutex);
2082 		aio_req_free(aiop, reqp);
2083 		aiop->aio_pending--;
2084 		if (aiop->aio_flags & AIO_REQ_BLOCK)
2085 			cv_signal(&aiop->aio_cleanupcv);
2086 		mutex_exit(&aiop->aio_mutex);
2087 		return (error);
2088 	}
2089 	clear_active_fd(fdes);
2090 	return (0);
2091 }
2092 
2093 /*
2094  * Take request out of the port pending queue ...
2095  */
2096 
2097 void
2098 aio_deq_port_pending(aio_t *aiop, aio_req_t *reqp)
2099 {
2100 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2101 	if (reqp->aio_req_prev == NULL)
2102 		/* first request */
2103 		aiop->aio_portpending = reqp->aio_req_next;
2104 	else
2105 		reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2106 	if (reqp->aio_req_next != NULL)
2107 		reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2108 }
2109 
2110 /*
2111  * posix version of asynchronous read and write
2112  */
2113 static	int
2114 aiorw(
2115 	int		opcode,
2116 	void		*aiocb_arg,
2117 	int		mode,
2118 	int		run_mode)
2119 {
2120 #ifdef _SYSCALL32_IMPL
2121 	aiocb32_t	aiocb32;
2122 	struct	sigevent32 *sigev32;
2123 	port_notify32_t	pntfy32;
2124 #endif
2125 	aiocb64_32_t	aiocb64;
2126 	aiocb_t		aiocb;
2127 	file_t		*fp;
2128 	int		error, fd;
2129 	size_t		bufsize;
2130 	struct vnode	*vp;
2131 	aio_req_t	*reqp;
2132 	aio_t		*aiop;
2133 	int		(*aio_func)();
2134 	aio_result_t	*resultp;
2135 	struct	sigevent *sigev;
2136 	model_t		model;
2137 	int		aio_use_port = 0;
2138 	port_notify_t	pntfy;
2139 
2140 	model = get_udatamodel();
2141 	aiop = curproc->p_aio;
2142 	if (aiop == NULL)
2143 		return (EINVAL);
2144 
2145 	if (model == DATAMODEL_NATIVE) {
2146 		if (run_mode != AIO_LARGEFILE) {
2147 			if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
2148 				return (EFAULT);
2149 			bufsize = aiocb.aio_nbytes;
2150 			resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
2151 			if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
2152 				return (EBADF);
2153 			}
2154 			sigev = &aiocb.aio_sigevent;
2155 		} else {
2156 			/*
2157 			 * We come here only when we make largefile
2158 			 * call on 32 bit kernel using 32 bit library.
2159 			 */
2160 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2161 				return (EFAULT);
2162 			bufsize = aiocb64.aio_nbytes;
2163 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2164 			    ->aio_resultp);
2165 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) {
2166 				return (EBADF);
2167 			}
2168 			sigev = (struct sigevent *)&aiocb64.aio_sigevent;
2169 		}
2170 
2171 		if (sigev->sigev_notify == SIGEV_PORT) {
2172 			if (copyin((void *)sigev->sigev_value.sival_ptr,
2173 			    &pntfy, sizeof (port_notify_t))) {
2174 				releasef(fd);
2175 				return (EFAULT);
2176 			}
2177 			aio_use_port = 1;
2178 		}
2179 	}
2180 #ifdef	_SYSCALL32_IMPL
2181 	else {
2182 		if (run_mode == AIO_32) {
2183 			/* 32 bit system call is being made on 64 bit kernel */
2184 			if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
2185 				return (EFAULT);
2186 
2187 			bufsize = aiocb32.aio_nbytes;
2188 			aiocb_32ton(&aiocb32, &aiocb);
2189 			resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
2190 			    aio_resultp);
2191 			if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
2192 				return (EBADF);
2193 			}
2194 			sigev32 = &aiocb32.aio_sigevent;
2195 		} else if (run_mode == AIO_LARGEFILE) {
2196 			/*
2197 			 * We come here only when we make largefile
2198 			 * call on 64 bit kernel using 32 bit library.
2199 			 */
2200 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2201 				return (EFAULT);
2202 			bufsize = aiocb64.aio_nbytes;
2203 			aiocb_LFton(&aiocb64, &aiocb);
2204 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2205 			    ->aio_resultp);
2206 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2207 				return (EBADF);
2208 			sigev32 = &aiocb64.aio_sigevent;
2209 		}
2210 
2211 		if (sigev32->sigev_notify == SIGEV_PORT) {
2212 			if (copyin(
2213 			    (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
2214 			    &pntfy32, sizeof (port_notify32_t))) {
2215 				releasef(fd);
2216 				return (EFAULT);
2217 			}
2218 			pntfy.portnfy_port = pntfy32.portnfy_port;
2219 			pntfy.portnfy_user =
2220 			    (void *)(uintptr_t)pntfy32.portnfy_user;
2221 			aio_use_port = 1;
2222 		}
2223 	}
2224 #endif  /* _SYSCALL32_IMPL */
2225 
2226 	/*
2227 	 * check the permission of the partition
2228 	 */
2229 
2230 	if ((fp->f_flag & mode) == 0) {
2231 		releasef(fd);
2232 		return (EBADF);
2233 	}
2234 
2235 	vp = fp->f_vnode;
2236 	aio_func = check_vp(vp, mode);
2237 	if (aio_func == NULL) {
2238 		releasef(fd);
2239 		return (EBADFD);
2240 	}
2241 	if ((model == DATAMODEL_NATIVE) && (run_mode == AIO_LARGEFILE))
2242 		error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp,
2243 		    aio_use_port, vp);
2244 	else
2245 		error = aio_req_setup(&reqp, aiop, &aiocb, resultp,
2246 		    aio_use_port, vp);
2247 
2248 	if (error) {
2249 		releasef(fd);
2250 		return (error);
2251 	}
2252 	/*
2253 	 * enable polling on this request if the opcode has
2254 	 * the AIO poll bit set
2255 	 */
2256 	if (opcode & AIO_POLL_BIT)
2257 		reqp->aio_req_flags |= AIO_POLL;
2258 
2259 	if (model == DATAMODEL_NATIVE)
2260 		reqp->aio_req_iocb.iocb = aiocb_arg;
2261 #ifdef  _SYSCALL32_IMPL
2262 	else
2263 		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
2264 #endif
2265 
2266 	if (aio_use_port)
2267 		error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp);
2268 
2269 	/*
2270 	 * send the request to driver.
2271 	 * Clustering: If PXFS vnode, call PXFS function.
2272 	 */
2273 	if (error == 0) {
2274 		if (bufsize == 0) {
2275 			clear_active_fd(fd);
2276 			aio_zerolen(reqp);
2277 			return (0);
2278 		}
2279 		error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2280 	}
2281 
2282 	/*
2283 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2284 	 * is released by the aio_cleanup_thread() when the IO has
2285 	 * completed.
2286 	 */
2287 	if (error) {
2288 		releasef(fd);
2289 		mutex_enter(&aiop->aio_mutex);
2290 		aio_deq_port_pending(aiop, reqp);
2291 		aio_req_free(aiop, reqp);
2292 		aiop->aio_pending--;
2293 		if (aiop->aio_flags & AIO_REQ_BLOCK)
2294 			cv_signal(&aiop->aio_cleanupcv);
2295 		mutex_exit(&aiop->aio_mutex);
2296 		return (error);
2297 	}
2298 	clear_active_fd(fd);
2299 	return (0);
2300 }
2301 
2302 
2303 /*
2304  * set error for a list IO entry that failed.
2305  */
2306 static void
2307 lio_set_error(aio_req_t *reqp)
2308 {
2309 	aio_t *aiop = curproc->p_aio;
2310 
2311 	if (aiop == NULL)
2312 		return;
2313 
2314 	mutex_enter(&aiop->aio_mutex);
2315 	aio_deq_port_pending(aiop, reqp);
2316 	aiop->aio_pending--;
2317 	/* request failed, AIO_PHYSIODONE set to aviod physio cleanup. */
2318 	reqp->aio_req_flags |= AIO_PHYSIODONE;
2319 	/*
2320 	 * Need to free the request now as its never
2321 	 * going to get on the done queue
2322 	 *
2323 	 * Note: aio_outstanding is decremented in
2324 	 *	 aio_req_free()
2325 	 */
2326 	aio_req_free(aiop, reqp);
2327 	if (aiop->aio_flags & AIO_REQ_BLOCK)
2328 		cv_signal(&aiop->aio_cleanupcv);
2329 	mutex_exit(&aiop->aio_mutex);
2330 }
2331 
2332 /*
2333  * check if a specified request is done, and remove it from
2334  * the done queue. otherwise remove anybody from the done queue
2335  * if NULL is specified.
2336  */
2337 static aio_req_t *
2338 aio_req_done(void *resultp)
2339 {
2340 	aio_req_t **bucket;
2341 	aio_req_t *ent;
2342 	aio_t *aiop = curproc->p_aio;
2343 	long index;
2344 
2345 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2346 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2347 
2348 	if (resultp) {
2349 		index = AIO_HASH(resultp);
2350 		bucket = &aiop->aio_hash[index];
2351 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2352 			if (ent->aio_req_resultp == (aio_result_t *)resultp) {
2353 				if (ent->aio_req_flags & AIO_DONEQ) {
2354 					return (aio_req_remove(ent));
2355 				}
2356 				return (NULL);
2357 			}
2358 		}
2359 		/* no match, resultp is invalid */
2360 		return (NULL);
2361 	}
2362 	return (aio_req_remove(NULL));
2363 }
2364 
2365 /*
2366  * determine if a user-level resultp pointer is associated with an
2367  * active IO request. Zero is returned when the request is done,
2368  * and the request is removed from the done queue. Only when the
2369  * return value is zero, is the "reqp" pointer valid. One is returned
2370  * when the request is inprogress. Two is returned when the request
2371  * is invalid.
2372  */
2373 static int
2374 aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
2375 {
2376 	aio_req_t **bucket;
2377 	aio_req_t *ent;
2378 	aio_t *aiop = curproc->p_aio;
2379 	long index;
2380 
2381 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2382 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2383 
2384 	index = AIO_HASH(resultp);
2385 	bucket = &aiop->aio_hash[index];
2386 	for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2387 		if (ent->aio_req_resultp == resultp) {
2388 			if (ent->aio_req_flags & AIO_DONEQ) {
2389 				*reqp = aio_req_remove(ent);
2390 				return (0);
2391 			}
2392 			return (1);
2393 		}
2394 	}
2395 	/* no match, resultp is invalid */
2396 	return (2);
2397 }
2398 
2399 /*
2400  * remove a request from the done queue.
2401  */
2402 static aio_req_t *
2403 aio_req_remove(aio_req_t *reqp)
2404 {
2405 	aio_t *aiop = curproc->p_aio;
2406 	aio_req_t *head;
2407 
2408 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2409 
2410 	if (reqp) {
2411 		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2412 		if (reqp->aio_req_next == reqp) {
2413 			/* only one request on queue */
2414 			if (reqp ==  aiop->aio_doneq) {
2415 				aiop->aio_doneq = NULL;
2416 			} else {
2417 				ASSERT(reqp == aiop->aio_cleanupq);
2418 				aiop->aio_cleanupq = NULL;
2419 			}
2420 		} else {
2421 			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2422 			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2423 			/*
2424 			 * The request can be either on the aio_doneq or the
2425 			 * aio_cleanupq
2426 			 */
2427 			if (reqp == aiop->aio_doneq)
2428 				aiop->aio_doneq = reqp->aio_req_next;
2429 
2430 			if (reqp == aiop->aio_cleanupq)
2431 				aiop->aio_cleanupq = reqp->aio_req_next;
2432 		}
2433 		reqp->aio_req_flags &= ~AIO_DONEQ;
2434 		return (reqp);
2435 	}
2436 
2437 	if (aiop->aio_doneq) {
2438 		head = aiop->aio_doneq;
2439 		ASSERT(head->aio_req_flags & AIO_DONEQ);
2440 		if (head == head->aio_req_next) {
2441 			/* only one request on queue */
2442 			aiop->aio_doneq = NULL;
2443 		} else {
2444 			head->aio_req_prev->aio_req_next = head->aio_req_next;
2445 			head->aio_req_next->aio_req_prev = head->aio_req_prev;
2446 			aiop->aio_doneq = head->aio_req_next;
2447 		}
2448 		head->aio_req_flags &= ~AIO_DONEQ;
2449 		return (head);
2450 	}
2451 	return (NULL);
2452 }
2453 
2454 static int
2455 aio_req_setup(
2456 	aio_req_t	**reqpp,
2457 	aio_t 		*aiop,
2458 	aiocb_t 	*arg,
2459 	aio_result_t 	*resultp,
2460 	int		port,
2461 	vnode_t		*vp)
2462 {
2463 	aio_req_t 	*reqp;
2464 	sigqueue_t	*sqp;
2465 	struct uio 	*uio;
2466 
2467 	struct sigevent *sigev;
2468 	int		error;
2469 
2470 	sigev = &arg->aio_sigevent;
2471 	if ((sigev->sigev_notify == SIGEV_SIGNAL) &&
2472 	    (sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG)) {
2473 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2474 		if (sqp == NULL)
2475 			return (EAGAIN);
2476 		sqp->sq_func = NULL;
2477 		sqp->sq_next = NULL;
2478 		sqp->sq_info.si_code = SI_ASYNCIO;
2479 		sqp->sq_info.si_pid = curproc->p_pid;
2480 		sqp->sq_info.si_ctid = PRCTID(curproc);
2481 		sqp->sq_info.si_zoneid = getzoneid();
2482 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2483 		sqp->sq_info.si_signo = sigev->sigev_signo;
2484 		sqp->sq_info.si_value = sigev->sigev_value;
2485 	} else
2486 		sqp = NULL;
2487 
2488 	mutex_enter(&aiop->aio_mutex);
2489 
2490 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
2491 		mutex_exit(&aiop->aio_mutex);
2492 		if (sqp)
2493 			kmem_free(sqp, sizeof (sigqueue_t));
2494 		return (EIO);
2495 	}
2496 	/*
2497 	 * get an aio_reqp from the free list or allocate one
2498 	 * from dynamic memory.
2499 	 */
2500 	if (error = aio_req_alloc(&reqp, resultp)) {
2501 		mutex_exit(&aiop->aio_mutex);
2502 		if (sqp)
2503 			kmem_free(sqp, sizeof (sigqueue_t));
2504 		return (error);
2505 	}
2506 	aiop->aio_pending++;
2507 	aiop->aio_outstanding++;
2508 	reqp->aio_req_flags = AIO_PENDING;
2509 	if (port)
2510 		aio_enq_port_pending(aiop, reqp);
2511 	mutex_exit(&aiop->aio_mutex);
2512 	/*
2513 	 * initialize aio request.
2514 	 */
2515 	reqp->aio_req_fd = arg->aio_fildes;
2516 	reqp->aio_req_sigqp = sqp;
2517 	reqp->aio_req_iocb.iocb = NULL;
2518 	reqp->aio_req_buf.b_file = vp;
2519 	uio = reqp->aio_req.aio_uio;
2520 	uio->uio_iovcnt = 1;
2521 	uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
2522 	uio->uio_iov->iov_len = arg->aio_nbytes;
2523 	uio->uio_loffset = arg->aio_offset;
2524 	*reqpp = reqp;
2525 	return (0);
2526 }
2527 
2528 /*
2529  * Allocate p_aio struct.
2530  */
2531 static aio_t *
2532 aio_aiop_alloc(void)
2533 {
2534 	aio_t	*aiop;
2535 
2536 	ASSERT(MUTEX_HELD(&curproc->p_lock));
2537 
2538 	aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
2539 	if (aiop) {
2540 		mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
2541 		mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT,
2542 									NULL);
2543 		mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
2544 	}
2545 	return (aiop);
2546 }
2547 
2548 /*
2549  * Allocate an aio_req struct.
2550  */
2551 static int
2552 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
2553 {
2554 	aio_req_t *reqp;
2555 	aio_t *aiop = curproc->p_aio;
2556 
2557 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2558 
2559 	if ((reqp = aiop->aio_free) != NULL) {
2560 		reqp->aio_req_flags = 0;
2561 		aiop->aio_free = reqp->aio_req_next;
2562 		/*
2563 		 * Clustering:This field has to be specifically
2564 		 * set to null so that the right thing can be
2565 		 * done in aphysio()
2566 		 */
2567 		reqp->aio_req_buf.b_iodone = NULL;
2568 	} else {
2569 		/*
2570 		 * Check whether memory is getting tight.
2571 		 * This is a temporary mechanism to avoid memory
2572 		 * exhaustion by a single process until we come up
2573 		 * with a per process solution such as setrlimit().
2574 		 */
2575 		if (freemem < desfree)
2576 			return (EAGAIN);
2577 
2578 		reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
2579 		if (reqp == NULL)
2580 			return (EAGAIN);
2581 		reqp->aio_req.aio_uio = &(reqp->aio_req_uio);
2582 		reqp->aio_req.aio_uio->uio_iov = &(reqp->aio_req_iov);
2583 		reqp->aio_req.aio_private = reqp;
2584 	}
2585 
2586 	reqp->aio_req_buf.b_offset = -1;
2587 	reqp->aio_req_resultp = resultp;
2588 	if (aio_hash_insert(reqp, aiop)) {
2589 		reqp->aio_req_next = aiop->aio_free;
2590 		aiop->aio_free = reqp;
2591 		return (EINVAL);
2592 	}
2593 	*nreqp = reqp;
2594 	return (0);
2595 }
2596 
2597 /*
2598  * Allocate an aio_lio_t struct.
2599  */
2600 static int
2601 aio_lio_alloc(aio_lio_t **head)
2602 {
2603 	aio_lio_t *liop;
2604 	aio_t *aiop = curproc->p_aio;
2605 
2606 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2607 
2608 	if ((liop = aiop->aio_lio_free) != NULL) {
2609 		aiop->aio_lio_free = liop->lio_next;
2610 	} else {
2611 		/*
2612 		 * Check whether memory is getting tight.
2613 		 * This is a temporary mechanism to avoid memory
2614 		 * exhaustion by a single process until we come up
2615 		 * with a per process solution such as setrlimit().
2616 		 */
2617 		if (freemem < desfree)
2618 			return (EAGAIN);
2619 
2620 		liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
2621 		if (liop == NULL)
2622 			return (EAGAIN);
2623 	}
2624 	*head = liop;
2625 	return (0);
2626 }
2627 
/*
 * this is a special per-process thread that is only activated if
 * the process is unmapping a segment with outstanding aio. normally,
 * the process will have completed the aio before unmapping the
 * segment. If the process does unmap a segment with outstanding aio,
 * this special thread will guarantee that the locked pages due to
 * aphysio() are released, thereby permitting the segment to be
 * unmapped. In addition to this, the cleanup thread is woken up
 * during DR operations to release the locked pages.
 *
 * aiop is the aio state the thread services.  The function loops until
 * it has been poked while the process has SEXITLWPS or SKILLED set and
 * all pending requests have drained; it then returns 0.
 */

static int
aio_cleanup_thread(aio_t *aiop)
{
	proc_t *p = curproc;
	struct as *as = p->p_as;
	int poked = 0;		/* set when cv_wait_sig() returned zero */
	kcondvar_t *cvp;	/* condvar to sleep on for this pass */
	int exit_flag = 0;	/* set once pending I/O has drained for exit */
	int rqclnup = 0;	/* local latch of aiop->aio_rqclnup */

	/* hold every signal except those in cantmask */
	sigfillset(&curthread->t_hold);
	sigdiffset(&curthread->t_hold, &cantmask);
	for (;;) {
		/*
		 * if a segment is being unmapped, and the current
		 * process's done queue is not empty, then every request
		 * on the doneq with locked resources should be forced
		 * to release their locks. By moving the doneq request
		 * to the cleanupq, aio_cleanup() will process the cleanupq,
		 * and place requests back onto the doneq. All requests
		 * processed by aio_cleanup() will have their physical
		 * resources unlocked.
		 */
		mutex_enter(&aiop->aio_mutex);
		if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
			aiop->aio_flags |= AIO_CLEANUP;
			mutex_enter(&as->a_contents);
			/* consume a posted cleanup request */
			if (aiop->aio_rqclnup) {
				aiop->aio_rqclnup = 0;
				rqclnup = 1;
			}

			if ((rqclnup || AS_ISUNMAPWAIT(as)) &&
					aiop->aio_doneq) {
				aio_req_t *doneqhead = aiop->aio_doneq;
				mutex_exit(&as->a_contents);
				aiop->aio_doneq = NULL;
				aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
			} else {
				mutex_exit(&as->a_contents);
			}
		}
		mutex_exit(&aiop->aio_mutex);
		aio_cleanup(AIO_CLEANUP_THREAD);
		/*
		 * thread should block on the cleanupcv while
		 * AIO_CLEANUP is set.
		 */
		cvp = &aiop->aio_cleanupcv;
		mutex_enter(&aiop->aio_mutex);

		/* more work showed up on one of the queues: go around again */
		if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
		    aiop->aio_notifyq != NULL ||
		    aiop->aio_portcleanupq != NULL) {
			mutex_exit(&aiop->aio_mutex);
			continue;
		}
		mutex_enter(&as->a_contents);

		/*
		 * AIO_CLEANUP determines when the cleanup thread
		 * should be active. This flag is set when
		 * the cleanup thread is awakened by as_unmap() or
		 * due to DR operations.
		 * The flag is cleared when the blocking as_unmap()
		 * that originally awakened us is allowed to
		 * complete. as_unmap() blocks when trying to
		 * unmap a segment that has SOFTLOCKed pages. when
		 * the segment's pages are all SOFTUNLOCKed,
		 * as->a_flags & AS_UNMAPWAIT should be zero.
		 *
		 * In case of cleanup request by DR, the flag is cleared
		 * once all the pending aio requests have been processed.
		 *
		 * The flag shouldn't be cleared right away if the
		 * cleanup thread was interrupted because the process
		 * is doing forkall(). This happens when cv_wait_sig()
		 * returns zero, because it was awakened by a pokelwps().
		 * If the process is not exiting, it must be doing forkall().
		 */
		if ((poked == 0) &&
			((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
					(aiop->aio_pending == 0))) {
			aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
			/* nothing left to clean: sleep on the as condvar */
			cvp = &as->a_cv;
			rqclnup = 0;
		}
		mutex_exit(&aiop->aio_mutex);
		if (poked) {
			/*
			 * If the process is exiting/killed, don't return
			 * immediately without waiting for pending I/O's
			 * and releasing the page locks.
			 */
			if (p->p_flag & (SEXITLWPS|SKILLED)) {
				/*
				 * If exit_flag is set, then it is
				 * safe to exit because we have released
				 * page locks of completed I/O's.
				 * as->a_contents is still held here and is
				 * dropped after the loop.
				 */
				if (exit_flag)
					break;

				mutex_exit(&as->a_contents);

				/*
				 * Wait for all the pending aio to complete.
				 * AIO_REQ_BLOCK is set first so that the
				 * request setup routines refuse new requests.
				 */
				mutex_enter(&aiop->aio_mutex);
				aiop->aio_flags |= AIO_REQ_BLOCK;
				while (aiop->aio_pending != 0)
					cv_wait(&aiop->aio_cleanupcv,
						&aiop->aio_mutex);
				mutex_exit(&aiop->aio_mutex);
				exit_flag = 1;
				continue;
			} else if (p->p_flag &
			    (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
				/*
				 * hold LWP until it
				 * is continued.
				 */
				mutex_exit(&as->a_contents);
				mutex_enter(&p->p_lock);
				stop(PR_SUSPENDED, SUSPEND_NORMAL);
				mutex_exit(&p->p_lock);
				poked = 0;
				continue;
			}
		} else {
			/*
			 * When started this thread will sleep on as->a_cv.
			 * as_unmap will awake this thread if the
			 * segment has SOFTLOCKed pages (poked = 0).
			 * 1. pokelwps() awakes this thread =>
			 *    break the loop to check SEXITLWPS, SHOLDFORK, etc
			 * 2. as_unmap awakes this thread =>
			 *    to break the loop it is necessary that
			 *    - AS_UNMAPWAIT is set (as_unmap is waiting for
			 *	memory to be unlocked)
			 *    - AIO_CLEANUP is not set
			 *	(if AIO_CLEANUP is set we have to wait for
			 *	pending requests. aio_done will send a signal
			 *	for every request which completes to continue
			 *	unmapping the corresponding address range)
			 * 3. A cleanup request will wake this thread up, ex.
			 *    by the DR operations. The aio_rqclnup flag will
			 *    be set.
			 */
			while (poked == 0) {
				/*
				 * we need to handle cleanup requests
				 * that come in after we had just cleaned up,
				 * so that we do cleanup of any new aio
				 * requests that got completed and have
				 * locked resources.
				 */
				if ((aiop->aio_rqclnup ||
					(AS_ISUNMAPWAIT(as) != 0)) &&
					(aiop->aio_flags & AIO_CLEANUP) == 0)
					break;
				/* a zero return from cv_wait_sig() means poked */
				poked = !cv_wait_sig(cvp, &as->a_contents);
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_signal(cvp);
				if (aiop->aio_outstanding != 0)
					break;
			}
		}
		mutex_exit(&as->a_contents);
	}
exit:
	/* reached only via the break above, with as->a_contents held */
	mutex_exit(&as->a_contents);
	ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
	aston(curthread);	/* make thread do post_syscall */
	return (0);
}
2815 
2816 /*
2817  * save a reference to a user's outstanding aio in a hash list.
2818  */
2819 static int
2820 aio_hash_insert(
2821 	aio_req_t *aio_reqp,
2822 	aio_t *aiop)
2823 {
2824 	long index;
2825 	aio_result_t *resultp = aio_reqp->aio_req_resultp;
2826 	aio_req_t *current;
2827 	aio_req_t **nextp;
2828 
2829 	index = AIO_HASH(resultp);
2830 	nextp = &aiop->aio_hash[index];
2831 	while ((current = *nextp) != NULL) {
2832 		if (current->aio_req_resultp == resultp)
2833 			return (DUPLICATE);
2834 		nextp = &current->aio_hash_next;
2835 	}
2836 	*nextp = aio_reqp;
2837 	aio_reqp->aio_hash_next = NULL;
2838 	return (0);
2839 }
2840 
2841 static int
2842 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
2843     cred_t *)
2844 {
2845 	struct snode *sp;
2846 	dev_t		dev;
2847 	struct cb_ops  	*cb;
2848 	major_t		major;
2849 	int		(*aio_func)();
2850 
2851 	dev = vp->v_rdev;
2852 	major = getmajor(dev);
2853 
2854 	/*
2855 	 * return NULL for requests to files and STREAMs so
2856 	 * that libaio takes care of them.
2857 	 */
2858 	if (vp->v_type == VCHR) {
2859 		/* no stream device for kaio */
2860 		if (STREAMSTAB(major)) {
2861 			return (NULL);
2862 		}
2863 	} else {
2864 		return (NULL);
2865 	}
2866 
2867 	/*
2868 	 * Check old drivers which do not have async I/O entry points.
2869 	 */
2870 	if (devopsp[major]->devo_rev < 3)
2871 		return (NULL);
2872 
2873 	cb = devopsp[major]->devo_cb_ops;
2874 
2875 	if (cb->cb_rev < 1)
2876 		return (NULL);
2877 
2878 	/*
2879 	 * Check whether this device is a block device.
2880 	 * Kaio is not supported for devices like tty.
2881 	 */
2882 	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
2883 		return (NULL);
2884 
2885 	/*
2886 	 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
2887 	 * We cannot call the driver directly. Instead return the
2888 	 * PXFS functions.
2889 	 */
2890 
2891 	if (IS_PXFSVP(vp)) {
2892 		if (mode & FREAD)
2893 			return (clpxfs_aio_read);
2894 		else
2895 			return (clpxfs_aio_write);
2896 	}
2897 	if (mode & FREAD)
2898 		aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
2899 	else
2900 		aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;
2901 
2902 	/*
2903 	 * Do we need this ?
2904 	 * nodev returns ENXIO anyway.
2905 	 */
2906 	if (aio_func == nodev)
2907 		return (NULL);
2908 
2909 	sp = VTOS(vp);
2910 	smark(sp, SACC);
2911 	return (aio_func);
2912 }
2913 
2914 /*
2915  * Clustering: We want check_vp to return a function prototyped
2916  * correctly that will be common to both PXFS and regular case.
2917  * We define this intermediate function that will do the right
2918  * thing for driver cases.
2919  */
2920 
2921 static int
2922 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2923 {
2924 	dev_t dev;
2925 	struct cb_ops  	*cb;
2926 
2927 	ASSERT(vp->v_type == VCHR);
2928 	ASSERT(!IS_PXFSVP(vp));
2929 	dev = VTOS(vp)->s_dev;
2930 	ASSERT(STREAMSTAB(getmajor(dev)) == NULL);
2931 
2932 	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2933 
2934 	ASSERT(cb->cb_awrite != nodev);
2935 	return ((*cb->cb_awrite)(dev, aio, cred_p));
2936 }
2937 
2938 /*
2939  * Clustering: We want check_vp to return a function prototyped
2940  * correctly that will be common to both PXFS and regular case.
2941  * We define this intermediate function that will do the right
2942  * thing for driver cases.
2943  */
2944 
2945 static int
2946 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2947 {
2948 	dev_t dev;
2949 	struct cb_ops  	*cb;
2950 
2951 	ASSERT(vp->v_type == VCHR);
2952 	ASSERT(!IS_PXFSVP(vp));
2953 	dev = VTOS(vp)->s_dev;
2954 	ASSERT(!STREAMSTAB(getmajor(dev)));
2955 
2956 	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2957 
2958 	ASSERT(cb->cb_aread != nodev);
2959 	return ((*cb->cb_aread)(dev, aio, cred_p));
2960 }
2961 
2962 /*
2963  * This routine is called when a largefile call is made by a 32bit
2964  * process on a ILP32 or LP64 kernel. All 64bit processes are large
2965  * file by definition and will call alio() instead.
2966  */
2967 static int
2968 alioLF(
2969 	int		mode_arg,
2970 	void		*aiocb_arg,
2971 	int		nent,
2972 	void		*sigev)
2973 {
2974 	file_t		*fp;
2975 	file_t		*prev_fp = NULL;
2976 	int		prev_mode = -1;
2977 	struct vnode	*vp;
2978 	aio_lio_t	*head;
2979 	aio_req_t	*reqp;
2980 	aio_t		*aiop;
2981 	caddr_t		cbplist;
2982 	aiocb64_32_t	*cbp;
2983 	caddr32_t	*ucbp;
2984 	aiocb64_32_t	cb64;
2985 	aiocb64_32_t	*aiocb = &cb64;
2986 #ifdef _LP64
2987 	aiocb_t		aiocb_n;
2988 #endif
2989 	struct sigevent32	sigevk;
2990 	sigqueue_t	*sqp;
2991 	int		(*aio_func)();
2992 	int		mode;
2993 	int		error = 0, aio_errors = 0;
2994 	int		i;
2995 	size_t		ssize;
2996 	int		deadhead = 0;
2997 	int		aio_notsupported = 0;
2998 	int		aio_use_port = 0;
2999 	port_kevent_t	*pkevtp = NULL;
3000 	port_notify32_t	pnotify;
3001 
3002 	aiop = curproc->p_aio;
3003 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
3004 		return (EINVAL);
3005 
3006 	ASSERT(get_udatamodel() == DATAMODEL_ILP32);
3007 
3008 	ssize = (sizeof (caddr32_t) * nent);
3009 	cbplist = kmem_alloc(ssize, KM_SLEEP);
3010 	ucbp = (caddr32_t *)cbplist;
3011 
3012 	if (copyin(aiocb_arg, cbplist, ssize)) {
3013 		kmem_free(cbplist, ssize);
3014 		return (EFAULT);
3015 	}
3016 
3017 	if (sigev) {
3018 		if (copyin(sigev, &sigevk, sizeof (sigevk))) {
3019 			kmem_free(cbplist, ssize);
3020 			return (EFAULT);
3021 		}
3022 	}
3023 
3024 	/*
3025 	 * a list head should be allocated if notification is
3026 	 * enabled for this list.
3027 	 */
3028 	head = NULL;
3029 
3030 	/* Event Ports  */
3031 
3032 	if (sigev && sigevk.sigev_notify == SIGEV_PORT) {
3033 		/* Use PORT for completion notification */
3034 		if (copyin((void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
3035 		    &pnotify, sizeof (port_notify32_t))) {
3036 			kmem_free(cbplist, ssize);
3037 			return (EFAULT);
3038 		}
3039 		/* use event ports for the list of aiocbs */
3040 		aio_use_port = 1;
3041 		error = port_alloc_event(pnotify.portnfy_port,
3042 		    PORT_ALLOC_PRIVATE, PORT_SOURCE_AIO, &pkevtp);
3043 		if (error) {
3044 			if (error == ENOMEM)
3045 				error = EAGAIN;
3046 			kmem_free(cbplist, ssize);
3047 			return (error);
3048 		}
3049 	} else if ((mode_arg == LIO_WAIT) || sigev) {
3050 		mutex_enter(&aiop->aio_mutex);
3051 		error = aio_lio_alloc(&head);
3052 		mutex_exit(&aiop->aio_mutex);
3053 		if (error)
3054 			goto done;
3055 		deadhead = 1;
3056 		head->lio_nent = nent;
3057 		head->lio_refcnt = nent;
3058 		if (sigev && (sigevk.sigev_notify == SIGEV_SIGNAL) &&
3059 		    (sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG)) {
3060 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3061 			if (sqp == NULL) {
3062 				error = EAGAIN;
3063 				goto done;
3064 			}
3065 			sqp->sq_func = NULL;
3066 			sqp->sq_next = NULL;
3067 			sqp->sq_info.si_code = SI_ASYNCIO;
3068 			sqp->sq_info.si_pid = curproc->p_pid;
3069 			sqp->sq_info.si_ctid = PRCTID(curproc);
3070 			sqp->sq_info.si_zoneid = getzoneid();
3071 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3072 			sqp->sq_info.si_signo = sigevk.sigev_signo;
3073 			sqp->sq_info.si_value.sival_int =
3074 			    sigevk.sigev_value.sival_int;
3075 			head->lio_sigqp = sqp;
3076 		} else {
3077 			head->lio_sigqp = NULL;
3078 		}
3079 	}
3080 
3081 	for (i = 0; i < nent; i++, ucbp++) {
3082 
3083 		cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
3084 		/* skip entry if it can't be copied. */
3085 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (aiocb64_32_t))) {
3086 			if (head) {
3087 				mutex_enter(&aiop->aio_mutex);
3088 				head->lio_nent--;
3089 				head->lio_refcnt--;
3090 				mutex_exit(&aiop->aio_mutex);
3091 			}
3092 			continue;
3093 		}
3094 
3095 		/* skip if opcode for aiocb is LIO_NOP */
3096 
3097 		mode = aiocb->aio_lio_opcode;
3098 		if (mode == LIO_NOP) {
3099 			cbp = NULL;
3100 			if (head) {
3101 				mutex_enter(&aiop->aio_mutex);
3102 				head->lio_nent--;
3103 				head->lio_refcnt--;
3104 				mutex_exit(&aiop->aio_mutex);
3105 			}
3106 			continue;
3107 		}
3108 
3109 		/* increment file descriptor's ref count. */
3110 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3111 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3112 			if (head) {
3113 				mutex_enter(&aiop->aio_mutex);
3114 				head->lio_nent--;
3115 				head->lio_refcnt--;
3116 				mutex_exit(&aiop->aio_mutex);
3117 			}
3118 			aio_errors++;
3119 			continue;
3120 		}
3121 
3122 		vp = fp->f_vnode;
3123 
3124 		/*
3125 		 * check the permission of the partition
3126 		 */
3127 		mode = aiocb->aio_lio_opcode;
3128 		if ((fp->f_flag & mode) == 0) {
3129 			releasef(aiocb->aio_fildes);
3130 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3131 			if (head) {
3132 				mutex_enter(&aiop->aio_mutex);
3133 				head->lio_nent--;
3134 				head->lio_refcnt--;
3135 				mutex_exit(&aiop->aio_mutex);
3136 			}
3137 			aio_errors++;
3138 			continue;
3139 		}
3140 
3141 		/*
3142 		 * common case where requests are to the same fd
3143 		 * for the same r/w operation
3144 		 * for UFS, need to set EBADFD
3145 		 */
3146 		if ((fp != prev_fp) || (mode != prev_mode)) {
3147 			aio_func = check_vp(vp, mode);
3148 			if (aio_func == NULL) {
3149 				prev_fp = NULL;
3150 				releasef(aiocb->aio_fildes);
3151 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
3152 				aio_notsupported++;
3153 				if (head) {
3154 					mutex_enter(&aiop->aio_mutex);
3155 					head->lio_nent--;
3156 					head->lio_refcnt--;
3157 					mutex_exit(&aiop->aio_mutex);
3158 				}
3159 				continue;
3160 			} else {
3161 				prev_fp = fp;
3162 				prev_mode = mode;
3163 			}
3164 		}
3165 #ifdef	_LP64
3166 		aiocb_LFton(aiocb, &aiocb_n);
3167 		error = aio_req_setup(&reqp, aiop, &aiocb_n,
3168 		    (aio_result_t *)&cbp->aio_resultp, aio_use_port, vp);
3169 #else
3170 		error = aio_req_setupLF(&reqp, aiop, aiocb,
3171 		    (aio_result_t *)&cbp->aio_resultp, aio_use_port, vp);
3172 #endif  /* _LP64 */
3173 		if (error) {
3174 			releasef(aiocb->aio_fildes);
3175 			if (head) {
3176 				mutex_enter(&aiop->aio_mutex);
3177 				head->lio_nent--;
3178 				head->lio_refcnt--;
3179 				mutex_exit(&aiop->aio_mutex);
3180 			}
3181 			aio_errors++;
3182 			continue;
3183 		}
3184 
3185 		reqp->aio_req_lio = head;
3186 		deadhead = 0;
3187 
3188 		/*
3189 		 * Set the errno field now before sending the request to
3190 		 * the driver to avoid a race condition
3191 		 */
3192 		(void) suword32(&cbp->aio_resultp.aio_errno,
3193 		    EINPROGRESS);
3194 
3195 		reqp->aio_req_iocb.iocb32 = *ucbp;
3196 
3197 		if (aio_use_port) {
3198 			reqp->aio_req_port = pnotify.portnfy_port;
3199 			error = aio_req_assoc_port32(&aiocb->aio_sigevent,
3200 			    (void *)(uintptr_t)pnotify.portnfy_user,
3201 			    (aiocb_t *)(uintptr_t)*ucbp, reqp, pkevtp);
3202 		}
3203 
3204 		/*
3205 		 * send the request to driver.
3206 		 * Clustering: If PXFS vnode, call PXFS function.
3207 		 */
3208 		if (error == 0) {
3209 			if (aiocb->aio_nbytes == 0) {
3210 				clear_active_fd(aiocb->aio_fildes);
3211 				aio_zerolen(reqp);
3212 				continue;
3213 			}
3214 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3215 			    CRED());
3216 		}
3217 
3218 		/*
3219 		 * the fd's ref count is not decremented until the IO has
3220 		 * completed unless there was an error.
3221 		 */
3222 		if (error) {
3223 			releasef(aiocb->aio_fildes);
3224 			lio_set_uerror(&cbp->aio_resultp, error);
3225 			if (head) {
3226 				mutex_enter(&aiop->aio_mutex);
3227 				head->lio_nent--;
3228 				head->lio_refcnt--;
3229 				mutex_exit(&aiop->aio_mutex);
3230 			}
3231 			if (error == ENOTSUP)
3232 				aio_notsupported++;
3233 			else
3234 				aio_errors++;
3235 			lio_set_error(reqp);
3236 		} else {
3237 			clear_active_fd(aiocb->aio_fildes);
3238 		}
3239 	}
3240 
3241 	if (pkevtp)
3242 		port_free_event(pkevtp);
3243 
3244 	if (aio_notsupported) {
3245 		error = ENOTSUP;
3246 	} else if (aio_errors) {
3247 		/*
3248 		 * return EIO if any request failed
3249 		 */
3250 		error = EIO;
3251 	}
3252 
3253 	if (mode_arg == LIO_WAIT) {
3254 		mutex_enter(&aiop->aio_mutex);
3255 		while (head->lio_refcnt > 0) {
3256 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3257 				mutex_exit(&aiop->aio_mutex);
3258 				error = EINTR;
3259 				goto done;
3260 			}
3261 		}
3262 		mutex_exit(&aiop->aio_mutex);
3263 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
3264 	}
3265 
3266 done:
3267 	kmem_free(cbplist, ssize);
3268 	if (deadhead) {
3269 		if (head->lio_sigqp)
3270 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3271 		kmem_free(head, sizeof (aio_lio_t));
3272 	}
3273 	return (error);
3274 }
3275 
3276 #ifdef  _SYSCALL32_IMPL
3277 static void
3278 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
3279 {
3280 	dest->aio_fildes = src->aio_fildes;
3281 	dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
3282 	dest->aio_nbytes = (size_t)src->aio_nbytes;
3283 	dest->aio_offset = (off_t)src->aio_offset;
3284 	dest->aio_reqprio = src->aio_reqprio;
3285 	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3286 	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3287 
3288 	/*
3289 	 * See comment in sigqueue32() on handling of 32-bit
3290 	 * sigvals in a 64-bit kernel.
3291 	 */
3292 	dest->aio_sigevent.sigev_value.sival_int =
3293 	    (int)src->aio_sigevent.sigev_value.sival_int;
3294 	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3295 	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
3296 	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3297 	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3298 	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3299 	dest->aio_lio_opcode = src->aio_lio_opcode;
3300 	dest->aio_state = src->aio_state;
3301 	dest->aio__pad[0] = src->aio__pad[0];
3302 }
3303 #endif
3304 
3305 /*
3306  * This function is used only for largefile calls made by
3307  * 32 bit applications on 32 bit kernel.
3308  */
3309 static int
3310 aio_req_setupLF(
3311 	aio_req_t	**reqpp,
3312 	aio_t		*aiop,
3313 	aiocb64_32_t	*arg,
3314 	aio_result_t	*resultp,
3315 	int		port,
3316 	vnode_t		*vp)
3317 {
3318 	aio_req_t	*reqp;
3319 	sigqueue_t	*sqp;
3320 	struct	uio	*uio;
3321 
3322 	struct	sigevent *sigev;
3323 	int 		error;
3324 
3325 	sigev = (struct	sigevent *)&arg->aio_sigevent;
3326 	if ((sigev->sigev_notify == SIGEV_SIGNAL) &&
3327 	    (sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG)) {
3328 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3329 		if (sqp == NULL)
3330 			return (EAGAIN);
3331 		sqp->sq_func = NULL;
3332 		sqp->sq_next = NULL;
3333 		sqp->sq_info.si_code = SI_ASYNCIO;
3334 		sqp->sq_info.si_pid = curproc->p_pid;
3335 		sqp->sq_info.si_ctid = PRCTID(curproc);
3336 		sqp->sq_info.si_zoneid = getzoneid();
3337 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3338 		sqp->sq_info.si_signo = sigev->sigev_signo;
3339 		sqp->sq_info.si_value = sigev->sigev_value;
3340 	} else
3341 		sqp = NULL;
3342 
3343 	mutex_enter(&aiop->aio_mutex);
3344 
3345 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
3346 		mutex_exit(&aiop->aio_mutex);
3347 		if (sqp)
3348 			kmem_free(sqp, sizeof (sigqueue_t));
3349 		return (EIO);
3350 	}
3351 	/*
3352 	 * get an aio_reqp from the free list or allocate one
3353 	 * from dynamic memory.
3354 	 */
3355 	if (error = aio_req_alloc(&reqp, resultp)) {
3356 		mutex_exit(&aiop->aio_mutex);
3357 		if (sqp)
3358 			kmem_free(sqp, sizeof (sigqueue_t));
3359 		return (error);
3360 	}
3361 	aiop->aio_pending++;
3362 	aiop->aio_outstanding++;
3363 	reqp->aio_req_flags = AIO_PENDING;
3364 	if (port)
3365 		aio_enq_port_pending(aiop, reqp);
3366 	mutex_exit(&aiop->aio_mutex);
3367 	/*
3368 	 * initialize aio request.
3369 	 */
3370 	reqp->aio_req_fd = arg->aio_fildes;
3371 	reqp->aio_req_sigqp = sqp;
3372 	reqp->aio_req_iocb.iocb = NULL;
3373 	reqp->aio_req_buf.b_file = vp;
3374 	uio = reqp->aio_req.aio_uio;
3375 	uio->uio_iovcnt = 1;
3376 	uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
3377 	uio->uio_iov->iov_len = arg->aio_nbytes;
3378 	uio->uio_loffset = arg->aio_offset;
3379 	*reqpp = reqp;
3380 	return (0);
3381 }
3382 
3383 /*
3384  * This routine is called when a non largefile call is made by a 32bit
3385  * process on a ILP32 or LP64 kernel.
3386  */
3387 static int
3388 alio32(
3389 	int		mode_arg,
3390 	void		*aiocb_arg,
3391 	int		nent,
3392 	void		*sigev_arg)
3393 {
3394 	file_t		*fp;
3395 	file_t		*prev_fp = NULL;
3396 	int		prev_mode = -1;
3397 	struct vnode	*vp;
3398 	aio_lio_t	*head;
3399 	aio_req_t	*reqp;
3400 	aio_t		*aiop;
3401 	aiocb_t		cb;
3402 	aiocb_t		*aiocb = &cb;
3403 	caddr_t		cbplist;
3404 #ifdef	_LP64
3405 	aiocb32_t	*cbp;
3406 	caddr32_t	*ucbp;
3407 	aiocb32_t	cb32;
3408 	aiocb32_t	*aiocb32 = &cb32;
3409 	struct sigevent32	sigev;
3410 #else
3411 	aiocb_t		*cbp, **ucbp;
3412 	struct sigevent	sigev;
3413 #endif
3414 	sigqueue_t	*sqp;
3415 	int		(*aio_func)();
3416 	int		mode;
3417 	int		error = 0, aio_errors = 0;
3418 	int		i;
3419 	size_t		ssize;
3420 	int		deadhead = 0;
3421 	int		aio_notsupported = 0;
3422 	int		aio_use_port = 0;
3423 	port_kevent_t	*pkevtp = NULL;
3424 #ifdef	_LP64
3425 	port_notify32_t	pnotify;
3426 #else
3427 	port_notify_t	pnotify;
3428 #endif
3429 	aiop = curproc->p_aio;
3430 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
3431 		return (EINVAL);
3432 
3433 #ifdef	_LP64
3434 	ssize = (sizeof (caddr32_t) * nent);
3435 #else
3436 	ssize = (sizeof (aiocb_t *) * nent);
3437 #endif
3438 	cbplist = kmem_alloc(ssize, KM_SLEEP);
3439 	ucbp = (void *)cbplist;
3440 
3441 	if (copyin(aiocb_arg, cbplist, ssize)) {
3442 		kmem_free(cbplist, ssize);
3443 		return (EFAULT);
3444 	}
3445 
3446 	if (sigev_arg) {
3447 		if (copyin(sigev_arg, &sigev, sizeof (struct sigevent32))) {
3448 			kmem_free(cbplist, ssize);
3449 			return (EFAULT);
3450 		}
3451 	}
3452 
3453 	/*
3454 	 * a list head should be allocated if notification is
3455 	 * enabled for this list.
3456 	 */
3457 	head = NULL;
3458 
3459 	/* Event Ports  */
3460 
3461 	if (sigev_arg && sigev.sigev_notify == SIGEV_PORT) {
3462 		/* Use PORT for completion notification */
3463 		if (copyin((void *)(uintptr_t)sigev.sigev_value.sival_ptr,
3464 		    &pnotify, sizeof (port_notify32_t))) {
3465 			kmem_free(cbplist, ssize);
3466 			return (EFAULT);
3467 		}
3468 		/* use event ports for the list of aiocbs */
3469 		aio_use_port = 1;
3470 		error = port_alloc_event(pnotify.portnfy_port,
3471 		    PORT_ALLOC_PRIVATE, PORT_SOURCE_AIO, &pkevtp);
3472 		if (error) {
3473 			if ((error == ENOMEM) || (error == EAGAIN))
3474 				error = EAGAIN;
3475 			else
3476 				error = EINVAL;
3477 			kmem_free(cbplist, ssize);
3478 			return (error);
3479 		}
3480 	} else if ((mode_arg == LIO_WAIT) || sigev_arg) {
3481 		mutex_enter(&aiop->aio_mutex);
3482 		error = aio_lio_alloc(&head);
3483 		mutex_exit(&aiop->aio_mutex);
3484 		if (error)
3485 			goto done;
3486 		deadhead = 1;
3487 		head->lio_nent = nent;
3488 		head->lio_refcnt = nent;
3489 		if (sigev_arg && (sigev.sigev_notify == SIGEV_SIGNAL) &&
3490 		    (sigev.sigev_signo > 0 && sigev.sigev_signo < NSIG)) {
3491 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3492 			if (sqp == NULL) {
3493 				error = EAGAIN;
3494 				goto done;
3495 			}
3496 			sqp->sq_func = NULL;
3497 			sqp->sq_next = NULL;
3498 			sqp->sq_info.si_code = SI_ASYNCIO;
3499 			sqp->sq_info.si_pid = curproc->p_pid;
3500 			sqp->sq_info.si_ctid = PRCTID(curproc);
3501 			sqp->sq_info.si_zoneid = getzoneid();
3502 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3503 			sqp->sq_info.si_signo = sigev.sigev_signo;
3504 			sqp->sq_info.si_value.sival_int =
3505 			    sigev.sigev_value.sival_int;
3506 			head->lio_sigqp = sqp;
3507 		} else {
3508 			head->lio_sigqp = NULL;
3509 		}
3510 	}
3511 
3512 	for (i = 0; i < nent; i++, ucbp++) {
3513 
3514 		/* skip entry if it can't be copied. */
3515 #ifdef	_LP64
3516 		cbp = (aiocb32_t *)(uintptr_t)*ucbp;
3517 		if (cbp == NULL || copyin(cbp, aiocb32, sizeof (aiocb32_t))) {
3518 #else
3519 		cbp = (aiocb_t *)*ucbp;
3520 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (aiocb_t))) {
3521 #endif
3522 			if (head) {
3523 				mutex_enter(&aiop->aio_mutex);
3524 				head->lio_nent--;
3525 				head->lio_refcnt--;
3526 				mutex_exit(&aiop->aio_mutex);
3527 			}
3528 			continue;
3529 		}
3530 #ifdef	_LP64
3531 		/*
3532 		 * copy 32 bit structure into 64 bit structure
3533 		 */
3534 		aiocb_32ton(aiocb32, aiocb);
3535 #endif /* _LP64 */
3536 
3537 		/* skip if opcode for aiocb is LIO_NOP */
3538 
3539 		mode = aiocb->aio_lio_opcode;
3540 		if (mode == LIO_NOP) {
3541 			cbp = NULL;
3542 			if (head) {
3543 				mutex_enter(&aiop->aio_mutex);
3544 				head->lio_nent--;
3545 				head->lio_refcnt--;
3546 				mutex_exit(&aiop->aio_mutex);
3547 			}
3548 			continue;
3549 		}
3550 
3551 		/* increment file descriptor's ref count. */
3552 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3553 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3554 			if (head) {
3555 				mutex_enter(&aiop->aio_mutex);
3556 				head->lio_nent--;
3557 				head->lio_refcnt--;
3558 				mutex_exit(&aiop->aio_mutex);
3559 			}
3560 			aio_errors++;
3561 			continue;
3562 		}
3563 
3564 		vp = fp->f_vnode;
3565 
3566 		/*
3567 		 * check the permission of the partition
3568 		 */
3569 		mode = aiocb->aio_lio_opcode;
3570 		if ((fp->f_flag & mode) == 0) {
3571 			releasef(aiocb->aio_fildes);
3572 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3573 			if (head) {
3574 				mutex_enter(&aiop->aio_mutex);
3575 				head->lio_nent--;
3576 				head->lio_refcnt--;
3577 				mutex_exit(&aiop->aio_mutex);
3578 			}
3579 			aio_errors++;
3580 			continue;
3581 		}
3582 
3583 		/*
3584 		 * common case where requests are to the same fd
3585 		 * for the same r/w operation
3586 		 * for UFS, need to set EBADFD
3587 		 */
3588 		if ((fp != prev_fp) || (mode != prev_mode)) {
3589 			aio_func = check_vp(vp, mode);
3590 			if (aio_func == NULL) {
3591 				prev_fp = NULL;
3592 				releasef(aiocb->aio_fildes);
3593 				lio_set_uerror(&cbp->aio_resultp,
3594 				    EBADFD);
3595 				aio_notsupported++;
3596 				if (head) {
3597 					mutex_enter(&aiop->aio_mutex);
3598 					head->lio_nent--;
3599 					head->lio_refcnt--;
3600 					mutex_exit(&aiop->aio_mutex);
3601 				}
3602 				continue;
3603 			} else {
3604 				prev_fp = fp;
3605 				prev_mode = mode;
3606 			}
3607 		}
3608 		if (error = aio_req_setup(&reqp, aiop, aiocb,
3609 		    (aio_result_t *)&cbp->aio_resultp, aio_use_port, vp)) {
3610 			releasef(aiocb->aio_fildes);
3611 			lio_set_uerror(&cbp->aio_resultp, error);
3612 			if (head) {
3613 				mutex_enter(&aiop->aio_mutex);
3614 				head->lio_nent--;
3615 				head->lio_refcnt--;
3616 				mutex_exit(&aiop->aio_mutex);
3617 			}
3618 			aio_errors++;
3619 			continue;
3620 		}
3621 
3622 		reqp->aio_req_lio = head;
3623 		deadhead = 0;
3624 
3625 		/*
3626 		 * Set the errno field now before sending the request to
3627 		 * the driver to avoid a race condition
3628 		 */
3629 		(void) suword32(&cbp->aio_resultp.aio_errno,
3630 		    EINPROGRESS);
3631 
3632 		reqp->aio_req_iocb.iocb32 = ((caddr32_t *)cbplist)[i];
3633 
3634 		if (aio_use_port) {
3635 			reqp->aio_req_port = pnotify.portnfy_port;
3636 #ifdef _LP64
3637 			error = aio_req_assoc_port32(&aiocb32->aio_sigevent,
3638 			    (void *)(uintptr_t)pnotify.portnfy_user,
3639 			    (aiocb_t *)(uintptr_t)(((caddr32_t *)cbplist)[i]),
3640 			    reqp, pkevtp);
3641 #else
3642 			error = aio_req_assoc_port(&aiocb->aio_sigevent,
3643 			    pnotify.portnfy_user,
3644 			    (aiocb_t *)(((caddr32_t *)cbplist)[i]),
3645 			    reqp, pkevtp);
3646 #endif
3647 		}
3648 
3649 		/*
3650 		 * send the request to driver.
3651 		 * Clustering: If PXFS vnode, call PXFS function.
3652 		 */
3653 		if (error == 0) {
3654 			if (aiocb->aio_nbytes == 0) {
3655 				clear_active_fd(aiocb->aio_fildes);
3656 				aio_zerolen(reqp);
3657 				continue;
3658 			}
3659 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3660 			    CRED());
3661 		}
3662 
3663 		/*
3664 		 * the fd's ref count is not decremented until the IO has
3665 		 * completed unless there was an error.
3666 		 */
3667 		if (error) {
3668 			releasef(aiocb->aio_fildes);
3669 			lio_set_uerror(&cbp->aio_resultp, error);
3670 			if (head) {
3671 				mutex_enter(&aiop->aio_mutex);
3672 				head->lio_nent--;
3673 				head->lio_refcnt--;
3674 				mutex_exit(&aiop->aio_mutex);
3675 			}
3676 			if (error == ENOTSUP)
3677 				aio_notsupported++;
3678 			else
3679 				aio_errors++;
3680 			lio_set_error(reqp);
3681 		} else {
3682 			clear_active_fd(aiocb->aio_fildes);
3683 		}
3684 	}
3685 
3686 	if (pkevtp)
3687 		port_free_event(pkevtp);
3688 
3689 	if (aio_notsupported) {
3690 		error = ENOTSUP;
3691 	} else if (aio_errors) {
3692 		/*
3693 		 * return EIO if any request failed
3694 		 */
3695 		error = EIO;
3696 	}
3697 
3698 	if (mode_arg == LIO_WAIT) {
3699 		mutex_enter(&aiop->aio_mutex);
3700 		while (head->lio_refcnt > 0) {
3701 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3702 				mutex_exit(&aiop->aio_mutex);
3703 				error = EINTR;
3704 				goto done;
3705 			}
3706 		}
3707 		mutex_exit(&aiop->aio_mutex);
3708 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
3709 	}
3710 
3711 done:
3712 	kmem_free(cbplist, ssize);
3713 	if (deadhead) {
3714 		if (head->lio_sigqp)
3715 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3716 		kmem_free(head, sizeof (aio_lio_t));
3717 	}
3718 	return (error);
3719 }
3720 
3721 
#ifdef  _SYSCALL32_IMPL
/*
 * Widen a 32-bit application's aiocb (src) into the kernel's native
 * aiocb (dest), field by field.  Only the fields assigned below are
 * written; anything else in *dest is left as the caller had it.
 */
void
aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
{
	struct sigevent32	*sev32 = &src->aio_sigevent;
	struct sigevent		*sev = &dest->aio_sigevent;

	/* I/O description */
	dest->aio_fildes = src->aio_fildes;
	dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
	dest->aio_nbytes = (size_t)src->aio_nbytes;
	dest->aio_offset = (off_t)src->aio_offset;
	dest->aio_reqprio = src->aio_reqprio;

	/* completion notification */
	sev->sigev_notify = sev32->sigev_notify;
	sev->sigev_signo = sev32->sigev_signo;

	/*
	 * See comment in sigqueue32() on handling of 32-bit
	 * sigvals in a 64-bit kernel.
	 */
	sev->sigev_value.sival_int = (int)sev32->sigev_value.sival_int;
	sev->sigev_notify_function = (void (*)(union sigval))
	    (uintptr_t)sev32->sigev_notify_function;
	sev->sigev_notify_attributes = (pthread_attr_t *)
	    (uintptr_t)sev32->sigev_notify_attributes;
	sev->__sigev_pad2 = sev32->__sigev_pad2;

	/* listio opcode and trailing bookkeeping fields */
	dest->aio_lio_opcode = src->aio_lio_opcode;
	dest->aio_state = src->aio_state;
	dest->aio__pad[0] = src->aio__pad[0];
}
#endif /* _SYSCALL32_IMPL */
3750 
3751 /*
3752  * aio_port_callback() is called just before the event is retrieved from the
3753  * port. The task of this callback function is to finish the work of the
3754  * transaction for the application, it means :
3755  * - copyout transaction data to the application
3756  *	(this thread is running in the right process context)
3757  * - keep trace of the transaction (update of counters).
3758  * - free allocated buffers
3759  * The aiocb pointer is the object element of the port_kevent_t structure.
3760  *
3761  * flag :
3762  *	PORT_CALLBACK_DEFAULT : do copyout and free resources
3763  *	PORT_CALLBACK_CLOSE   : don't do copyout, free resources
3764  */
3765 
3766 /*ARGSUSED*/
3767 int
3768 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
3769 {
3770 	aio_t		*aiop = curproc->p_aio;
3771 	aio_req_t	*reqp = arg;
3772 	struct	iovec	*iov;
3773 	struct	buf	*bp;
3774 	void		*resultp;
3775 
3776 	if (pid != curproc->p_pid) {
3777 		/* wrong proc !!, can not deliver data here ... */
3778 		return (EACCES);
3779 	}
3780 
3781 	mutex_enter(&aiop->aio_portq_mutex);
3782 	reqp->aio_req_portkev = NULL;
3783 	aio_req_remove_portq(aiop, reqp); /* remove request from portq */
3784 	mutex_exit(&aiop->aio_portq_mutex);
3785 	aphysio_unlock(reqp);		/* unlock used pages */
3786 	mutex_enter(&aiop->aio_mutex);
3787 	if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
3788 		aio_req_free_port(aiop, reqp);	/* back to free list */
3789 		mutex_exit(&aiop->aio_mutex);
3790 		return (0);
3791 	}
3792 
3793 	iov = reqp->aio_req_uio.uio_iov;
3794 	bp = &reqp->aio_req_buf;
3795 	resultp = (void *)reqp->aio_req_resultp;
3796 	aio_req_free_port(aiop, reqp);	/* request struct back to free list */
3797 	mutex_exit(&aiop->aio_mutex);
3798 	if (flag == PORT_CALLBACK_DEFAULT)
3799 		aio_copyout_result_port(iov, bp, resultp);
3800 	return (0);
3801 }
3802