1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Kernel asynchronous I/O.
29 * This is only for raw devices now (as of Nov. 1993).
30 */
31
32 #include <sys/types.h>
33 #include <sys/errno.h>
34 #include <sys/conf.h>
35 #include <sys/file.h>
36 #include <sys/fs/snode.h>
37 #include <sys/unistd.h>
38 #include <sys/cmn_err.h>
39 #include <vm/as.h>
40 #include <vm/faultcode.h>
41 #include <sys/sysmacros.h>
42 #include <sys/procfs.h>
43 #include <sys/kmem.h>
44 #include <sys/autoconf.h>
45 #include <sys/ddi_impldefs.h>
46 #include <sys/sunddi.h>
47 #include <sys/aio_impl.h>
48 #include <sys/debug.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/vmsystm.h>
52 #include <sys/fs/pxfs_ki.h>
53 #include <sys/contract/process_impl.h>
54
55 /*
56  * external entry points.
57 */
58 #ifdef _LP64
59 static int64_t kaioc(long, long, long, long, long, long);
60 #endif
61 static int kaio(ulong_t *, rval_t *);
62
63
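/*
 * run_mode values passed to the routines below.  They indicate which
 * aiocb layout the caller used: AIO_64 for the native 64-bit aiocb,
 * AIO_32 for the 32-bit aiocb issued by ILP32 callers, and
 * AIO_LARGEFILE for the largefile (aiocb64_32) form used by ILP32
 * callers.
 */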
64 #define AIO_64 0
65 #define AIO_32 1
66 #define AIO_LARGEFILE 2
67
68 /*
69 * implementation specific functions (private)
70 */
71 #ifdef _LP64
72 static int alio(int, aiocb_t **, int, struct sigevent *);
73 #endif
74 static int aionotify(void);
75 static int aioinit(void);
76 static int aiostart(void);
77 static void alio_cleanup(aio_t *, aiocb_t **, int, int);
78 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
79 cred_t *);
80 static void lio_set_error(aio_req_t *, int portused);
81 static aio_t *aio_aiop_alloc();
82 static int aio_req_alloc(aio_req_t **, aio_result_t *);
83 static int aio_lio_alloc(aio_lio_t **);
84 static aio_req_t *aio_req_done(void *);
85 static aio_req_t *aio_req_remove(aio_req_t *);
86 static int aio_req_find(aio_result_t *, aio_req_t **);
87 static int aio_hash_insert(struct aio_req_t *, aio_t *);
88 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
89 aio_result_t *, vnode_t *, int);
90 static int aio_cleanup_thread(aio_t *);
91 static aio_lio_t *aio_list_get(aio_result_t *);
92 static void lio_set_uerror(void *, int);
93 extern void aio_zerolen(aio_req_t *);
94 static int aiowait(struct timeval *, int, long *);
95 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
96 static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
97 aio_req_t *reqlist, aio_t *aiop, model_t model);
98 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
99 static int aiosuspend(void *, int, struct timespec *, int,
100 long *, int);
101 static int aliowait(int, void *, int, void *, int);
102 static int aioerror(void *, int);
103 static int aio_cancel(int, void *, long *, int);
104 static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
105 static int aiorw(int, void *, int, int);
106
107 static int alioLF(int, void *, int, void *);
108 static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
109 aio_result_t *, vnode_t *, int);
110 static int alio32(int, void *, int, void *);
111 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
112 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
113
114 #ifdef _SYSCALL32_IMPL
115 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
116 void aiocb_32ton(aiocb32_t *, aiocb_t *);
117 #endif /* _SYSCALL32_IMPL */
118
119 /*
120 * implementation specific functions (external)
121 */
122 void aio_req_free(aio_t *, aio_req_t *);
123
124 /*
125 * Event Port framework
126 */
127
128 void aio_req_free_port(aio_t *, aio_req_t *);
129 static int aio_port_callback(void *, int *, pid_t, int, void *);
130
131 /*
132 * This is the loadable module wrapper.
133 */
134 #include <sys/modctl.h>
135 #include <sys/syscall.h>
136
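/*
 * Note on the argument counts below: the native LP64 entry point
 * (kaioc) takes six long arguments, while the 32-bit entry points
 * take seven because the 64-bit file offset used by AIOREAD/AIOWRITE
 * is passed as two 32-bit words and reassembled in kaio().
 */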
137 #ifdef _LP64
138
139 static struct sysent kaio_sysent = {
140 6,
141 SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
142 (int (*)())kaioc
143 };
144
145 #ifdef _SYSCALL32_IMPL
146 static struct sysent kaio_sysent32 = {
147 7,
148 SE_NOUNLOAD | SE_64RVAL,
149 kaio
150 };
151 #endif /* _SYSCALL32_IMPL */
152
153 #else /* _LP64 */
154
155 static struct sysent kaio_sysent = {
156 7,
157 SE_NOUNLOAD | SE_32RVAL1,
158 kaio
159 };
160
161 #endif /* _LP64 */
162
163 /*
164 * Module linkage information for the kernel.
165 */
166
167 static struct modlsys modlsys = {
168 &mod_syscallops,
169 "kernel Async I/O",
170 &kaio_sysent
171 };
172
173 #ifdef _SYSCALL32_IMPL
174 static struct modlsys modlsys32 = {
175 &mod_syscallops32,
176 "kernel Async I/O for 32 bit compatibility",
177 &kaio_sysent32
178 };
179 #endif /* _SYSCALL32_IMPL */
180
181
182 static struct modlinkage modlinkage = {
183 MODREV_1,
184 &modlsys,
185 #ifdef _SYSCALL32_IMPL
186 &modlsys32,
187 #endif
188 NULL
189 };
190
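/*
 * Loadable module entry points.
 */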
191 int
192 _init(void)
193 {
194 int retval;
195
196 if ((retval = mod_install(&modlinkage)) != 0)
197 return (retval);
198
199 return (0);
200 }
201
202 int
203 _fini(void)
204 {
205 int retval;
206
207 retval = mod_remove(&modlinkage);
208
209 return (retval);
210 }
211
212 int
213 _info(struct modinfo *modinfop)
214 {
215 return (mod_info(&modlinkage, modinfop));
216 }
217
218 #ifdef _LP64
219 static int64_t
220 kaioc(
221 long a0,
222 long a1,
223 long a2,
224 long a3,
225 long a4,
226 long a5)
227 {
228 int error;
229 long rval = 0;
230
231 switch ((int)a0 & ~AIO_POLL_BIT) {
232 case AIOREAD:
233 error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
234 (offset_t)a4, (aio_result_t *)a5, FREAD);
235 break;
236 case AIOWRITE:
237 error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
238 (offset_t)a4, (aio_result_t *)a5, FWRITE);
239 break;
240 case AIOWAIT:
241 error = aiowait((struct timeval *)a1, (int)a2, &rval);
242 break;
243 case AIOWAITN:
244 error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
245 (timespec_t *)a4);
246 break;
247 case AIONOTIFY:
248 error = aionotify();
249 break;
250 case AIOINIT:
251 error = aioinit();
252 break;
253 case AIOSTART:
254 error = aiostart();
255 break;
256 case AIOLIO:
257 error = alio((int)a1, (aiocb_t **)a2, (int)a3,
258 (struct sigevent *)a4);
259 break;
260 case AIOLIOWAIT:
261 error = aliowait((int)a1, (void *)a2, (int)a3,
262 (struct sigevent *)a4, AIO_64);
263 break;
264 case AIOSUSPEND:
265 error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
266 (int)a4, &rval, AIO_64);
267 break;
268 case AIOERROR:
269 error = aioerror((void *)a1, AIO_64);
270 break;
271 case AIOAREAD:
272 error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
273 break;
274 case AIOAWRITE:
275 error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
276 break;
277 case AIOCANCEL:
278 error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
279 break;
280
281 /*
282 	 * The large file related stuff is valid only for the
283 	 * 32 bit kernel and not for the 64 bit kernel.
284 	 * On a 64 bit kernel we convert large file calls
285 	 * to regular 64 bit calls.
286 */
287
288 default:
289 error = EINVAL;
290 }
291 if (error)
292 return ((int64_t)set_errno(error));
293 return (rval);
294 }
295 #endif
296
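/*
 * kaio() is the native entry point on a 32-bit kernel and the 32-bit
 * compatibility entry point on a 64-bit kernel.  Arguments arrive as
 * an array of seven words (uap); the 64-bit file offset used by
 * AIOREAD/AIOWRITE is reassembled from uap[4] and uap[5] according to
 * the endianness of the machine.
 */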
297 static int
298 kaio(
299 ulong_t *uap,
300 rval_t *rvp)
301 {
302 long rval = 0;
303 int error = 0;
304 offset_t off;
305
306
307 rvp->r_vals = 0;
308 #if defined(_LITTLE_ENDIAN)
309 off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
310 #else
311 off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
312 #endif
313
314 switch (uap[0] & ~AIO_POLL_BIT) {
315 /*
316 * It must be the 32 bit system call on 64 bit kernel
317 */
318 case AIOREAD:
319 return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
320 (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
321 case AIOWRITE:
322 return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
323 (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
324 case AIOWAIT:
325 error = aiowait((struct timeval *)uap[1], (int)uap[2],
326 &rval);
327 break;
328 case AIOWAITN:
329 error = aiowaitn((void *)uap[1], (uint_t)uap[2],
330 (uint_t *)uap[3], (timespec_t *)uap[4]);
331 break;
332 case AIONOTIFY:
333 return (aionotify());
334 case AIOINIT:
335 return (aioinit());
336 case AIOSTART:
337 return (aiostart());
338 case AIOLIO:
339 return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
340 (void *)uap[4]));
341 case AIOLIOWAIT:
342 return (aliowait((int)uap[1], (void *)uap[2],
343 (int)uap[3], (struct sigevent *)uap[4], AIO_32));
344 case AIOSUSPEND:
345 error = aiosuspend((void *)uap[1], (int)uap[2],
346 (timespec_t *)uap[3], (int)uap[4],
347 &rval, AIO_32);
348 break;
349 case AIOERROR:
350 return (aioerror((void *)uap[1], AIO_32));
351 case AIOAREAD:
352 return (aiorw((int)uap[0], (void *)uap[1],
353 FREAD, AIO_32));
354 case AIOAWRITE:
355 return (aiorw((int)uap[0], (void *)uap[1],
356 FWRITE, AIO_32));
357 case AIOCANCEL:
358 error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
359 AIO_32));
360 break;
361 case AIOLIO64:
362 return (alioLF((int)uap[1], (void *)uap[2],
363 (int)uap[3], (void *)uap[4]));
364 case AIOLIOWAIT64:
365 return (aliowait(uap[1], (void *)uap[2],
366 (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
367 case AIOSUSPEND64:
368 error = aiosuspend((void *)uap[1], (int)uap[2],
369 (timespec_t *)uap[3], (int)uap[4], &rval,
370 AIO_LARGEFILE);
371 break;
372 case AIOERROR64:
373 return (aioerror((void *)uap[1], AIO_LARGEFILE));
374 case AIOAREAD64:
375 return (aiorw((int)uap[0], (void *)uap[1], FREAD,
376 AIO_LARGEFILE));
377 case AIOAWRITE64:
378 return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
379 AIO_LARGEFILE));
380 case AIOCANCEL64:
381 error = (aio_cancel((int)uap[1], (void *)uap[2],
382 &rval, AIO_LARGEFILE));
383 break;
384 default:
385 return (EINVAL);
386 }
387
388 rvp->r_val1 = rval;
389 return (error);
390 }
391
392 /*
393 * wake up LWPs in this process that are sleeping in
394 * aiowait().
395 */
396 static int
397 aionotify(void)
398 {
399 aio_t *aiop;
400
401 aiop = curproc->p_aio;
402 if (aiop == NULL)
403 return (0);
404
405 mutex_enter(&aiop->aio_mutex);
406 aiop->aio_notifycnt++;
407 cv_broadcast(&aiop->aio_waitcv);
408 mutex_exit(&aiop->aio_mutex);
409
410 return (0);
411 }
412
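/*
 * Convert a user-supplied struct timeval into a relative timestruc_t.
 * *rqtp is left NULL and *blocking is set when the timeout pointer is
 * NULL (wait indefinitely); *blocking is cleared for a -1 pointer or a
 * zero timeout (don't wait at all); otherwise *rqtp points to the
 * converted relative time and *blocking is set.
 */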
413 static int
414 timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
415 timestruc_t **rqtp, int *blocking)
416 {
417 #ifdef _SYSCALL32_IMPL
418 struct timeval32 wait_time_32;
419 #endif
420 struct timeval wait_time;
421 model_t model = get_udatamodel();
422
423 *rqtp = NULL;
424 if (timout == NULL) { /* wait indefinitely */
425 *blocking = 1;
426 return (0);
427 }
428
429 /*
430 * Need to correctly compare with the -1 passed in for a user
431 * address pointer, with both 32 bit and 64 bit apps.
432 */
433 if (model == DATAMODEL_NATIVE) {
434 if ((intptr_t)timout == (intptr_t)-1) { /* don't wait */
435 *blocking = 0;
436 return (0);
437 }
438
439 if (copyin(timout, &wait_time, sizeof (wait_time)))
440 return (EFAULT);
441 }
442 #ifdef _SYSCALL32_IMPL
443 else {
444 /*
445 * -1 from a 32bit app. It will not get sign extended.
446 * don't wait if -1.
447 */
448 if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
449 *blocking = 0;
450 return (0);
451 }
452
453 if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
454 return (EFAULT);
455 TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
456 }
457 #endif /* _SYSCALL32_IMPL */
458
459 if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) { /* don't wait */
460 *blocking = 0;
461 return (0);
462 }
463
464 if (wait_time.tv_sec < 0 ||
465 wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
466 return (EINVAL);
467
468 rqtime->tv_sec = wait_time.tv_sec;
469 rqtime->tv_nsec = wait_time.tv_usec * 1000;
470 *rqtp = rqtime;
471 *blocking = 1;
472
473 return (0);
474 }
475
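/*
 * Same conversion as timeval2reltime() but for a user-supplied
 * timespec_t; used by aiowaitn() and aiosuspend().
 */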
476 static int
477 timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
478 timestruc_t **rqtp, int *blocking)
479 {
480 #ifdef _SYSCALL32_IMPL
481 timespec32_t wait_time_32;
482 #endif
483 model_t model = get_udatamodel();
484
485 *rqtp = NULL;
486 if (timout == NULL) {
487 *blocking = 1;
488 return (0);
489 }
490
491 if (model == DATAMODEL_NATIVE) {
492 if (copyin(timout, rqtime, sizeof (*rqtime)))
493 return (EFAULT);
494 }
495 #ifdef _SYSCALL32_IMPL
496 else {
497 if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
498 return (EFAULT);
499 TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
500 }
501 #endif /* _SYSCALL32_IMPL */
502
503 if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
504 *blocking = 0;
505 return (0);
506 }
507
508 if (rqtime->tv_sec < 0 ||
509 rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
510 return (EINVAL);
511
512 *rqtp = rqtime;
513 *blocking = 1;
514
515 return (0);
516 }
517
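/*
 * aiowait() reaps a single completed request from the done queue,
 * optionally blocking until one completes, a signal arrives, or the
 * timeout expires.  *rval is set to the user address of the completed
 * request's aio_result_t, or to 1 if the wakeup came from aionotify().
 */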
518 /*ARGSUSED*/
519 static int
520 aiowait(
521 struct timeval *timout,
522 int dontblockflg,
523 long *rval)
524 {
525 int error;
526 aio_t *aiop;
527 aio_req_t *reqp;
528 clock_t status;
529 int blocking;
530 int timecheck;
531 timestruc_t rqtime;
532 timestruc_t *rqtp;
533
534 aiop = curproc->p_aio;
535 if (aiop == NULL)
536 return (EINVAL);
537
538 /*
539 * Establish the absolute future time for the timeout.
540 */
541 error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
542 if (error)
543 return (error);
544 if (rqtp) {
545 timestruc_t now;
546 timecheck = timechanged;
547 gethrestime(&now);
548 timespecadd(rqtp, &now);
549 }
550
551 mutex_enter(&aiop->aio_mutex);
552 for (;;) {
553 /* process requests on poll queue */
554 if (aiop->aio_pollq) {
555 mutex_exit(&aiop->aio_mutex);
556 aio_cleanup(0);
557 mutex_enter(&aiop->aio_mutex);
558 }
559 if ((reqp = aio_req_remove(NULL)) != NULL) {
560 *rval = (long)reqp->aio_req_resultp;
561 break;
562 }
563 /* user-level done queue might not be empty */
564 if (aiop->aio_notifycnt > 0) {
565 aiop->aio_notifycnt--;
566 *rval = 1;
567 break;
568 }
569 /* don't block if no outstanding aio */
570 if (aiop->aio_outstanding == 0 && dontblockflg) {
571 error = EINVAL;
572 break;
573 }
574 if (blocking) {
575 status = cv_waituntil_sig(&aiop->aio_waitcv,
576 &aiop->aio_mutex, rqtp, timecheck);
577
578 if (status > 0) /* check done queue again */
579 continue;
580 if (status == 0) { /* interrupted by a signal */
581 error = EINTR;
582 *rval = -1;
583 } else { /* timer expired */
584 error = ETIME;
585 }
586 }
587 break;
588 }
589 mutex_exit(&aiop->aio_mutex);
590 if (reqp) {
591 aphysio_unlock(reqp);
592 aio_copyout_result(reqp);
593 mutex_enter(&aiop->aio_mutex);
594 aio_req_free(aiop, reqp);
595 mutex_exit(&aiop->aio_mutex);
596 }
597 return (error);
598 }
599
600 /*
601 * aiowaitn can be used to reap completed asynchronous requests submitted with
602 * lio_listio, aio_read or aio_write.
603 * This function only reaps asynchronous raw I/Os.
604 */
605
606 /*ARGSUSED*/
607 static int
608 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
609 {
610 int error = 0;
611 aio_t *aiop;
612 aio_req_t *reqlist = NULL;
613 caddr_t iocblist = NULL; /* array of iocb ptr's */
614 uint_t waitcnt, cnt = 0; /* iocb cnt */
615 	size_t	iocbsz;			/* user's iocb size */
616 size_t riocbsz; /* returned iocb size */
617 int iocb_index = 0;
618 model_t model = get_udatamodel();
619 int blocking = 1;
620 int timecheck;
621 timestruc_t rqtime;
622 timestruc_t *rqtp;
623
624 aiop = curproc->p_aio;
625 if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX)
626 return (EINVAL);
627
628 if (aiop->aio_outstanding == 0)
629 return (EAGAIN);
630
631 if (copyin(nwait, &waitcnt, sizeof (uint_t)))
632 return (EFAULT);
633
634 /* set *nwait to zero, if we must return prematurely */
635 if (copyout(&cnt, nwait, sizeof (uint_t)))
636 return (EFAULT);
637
638 if (waitcnt == 0) {
639 blocking = 0;
640 rqtp = NULL;
641 waitcnt = nent;
642 } else {
643 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
644 if (error)
645 return (error);
646 }
647
648 if (model == DATAMODEL_NATIVE)
649 iocbsz = (sizeof (aiocb_t *) * nent);
650 #ifdef _SYSCALL32_IMPL
651 else
652 iocbsz = (sizeof (caddr32_t) * nent);
653 #endif /* _SYSCALL32_IMPL */
654
655 /*
656 * Only one aio_waitn call is allowed at a time.
657 * The active aio_waitn will collect all requests
658 * out of the "done" list and if necessary it will wait
659 * for some/all pending requests to fulfill the nwait
660 * parameter.
661 	 * A second or subsequent aio_waitn call will sleep here
662 	 * until the active aio_waitn finishes and leaves the kernel.
663 	 * If the second call does not block (poll), then return
664 	 * immediately with the error code EAGAIN.
665 	 * If the second call should block, then sleep here, but
666 	 * do not touch the timeout. The timeout starts when this
667 	 * aio_waitn call becomes active.
668 */
669
670 mutex_enter(&aiop->aio_mutex);
671
672 while (aiop->aio_flags & AIO_WAITN) {
673 if (blocking == 0) {
674 mutex_exit(&aiop->aio_mutex);
675 return (EAGAIN);
676 }
677
678 /* block, no timeout */
679 aiop->aio_flags |= AIO_WAITN_PENDING;
680 if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
681 mutex_exit(&aiop->aio_mutex);
682 return (EINTR);
683 }
684 }
685
686 /*
687 * Establish the absolute future time for the timeout.
688 */
689 if (rqtp) {
690 timestruc_t now;
691 timecheck = timechanged;
692 gethrestime(&now);
693 timespecadd(rqtp, &now);
694 }
695
696 if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
697 kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
698 aiop->aio_iocb = NULL;
699 }
700
701 if (aiop->aio_iocb == NULL) {
702 iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
703 if (iocblist == NULL) {
704 mutex_exit(&aiop->aio_mutex);
705 return (ENOMEM);
706 }
707 aiop->aio_iocb = (aiocb_t **)iocblist;
708 aiop->aio_iocbsz = iocbsz;
709 } else {
710 iocblist = (char *)aiop->aio_iocb;
711 }
712
713 aiop->aio_waitncnt = waitcnt;
714 aiop->aio_flags |= AIO_WAITN;
715
716 for (;;) {
717 /* push requests on poll queue to done queue */
718 if (aiop->aio_pollq) {
719 mutex_exit(&aiop->aio_mutex);
720 aio_cleanup(0);
721 mutex_enter(&aiop->aio_mutex);
722 }
723
724 /* check for requests on done queue */
725 if (aiop->aio_doneq) {
726 cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
727 aiop->aio_waitncnt = waitcnt - cnt;
728 }
729
730 /* user-level done queue might not be empty */
731 if (aiop->aio_notifycnt > 0) {
732 aiop->aio_notifycnt--;
733 error = 0;
734 break;
735 }
736
737 /*
738 		 * if we are here a second time as a result of timer
739 		 * expiration, we reset the error if there are enough
740 		 * aiocb's to satisfy the request.
741 		 * We also return if all requests are already done
742 		 * and we picked up the whole done queue.
743 */
744
745 if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
746 aiop->aio_doneq == NULL)) {
747 error = 0;
748 break;
749 }
750
751 if ((cnt < waitcnt) && blocking) {
752 int rval = cv_waituntil_sig(&aiop->aio_waitcv,
753 &aiop->aio_mutex, rqtp, timecheck);
754 if (rval > 0)
755 continue;
756 if (rval < 0) {
757 error = ETIME;
758 blocking = 0;
759 continue;
760 }
761 error = EINTR;
762 }
763 break;
764 }
765
766 mutex_exit(&aiop->aio_mutex);
767
768 if (cnt > 0) {
769
770 iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
771 aiop, model);
772
773 if (model == DATAMODEL_NATIVE)
774 riocbsz = (sizeof (aiocb_t *) * cnt);
775 #ifdef _SYSCALL32_IMPL
776 else
777 riocbsz = (sizeof (caddr32_t) * cnt);
778 #endif /* _SYSCALL32_IMPL */
779
780 if (copyout(iocblist, uiocb, riocbsz) ||
781 copyout(&cnt, nwait, sizeof (uint_t)))
782 error = EFAULT;
783 }
784
785 /* check if there is another thread waiting for execution */
786 mutex_enter(&aiop->aio_mutex);
787 aiop->aio_flags &= ~AIO_WAITN;
788 if (aiop->aio_flags & AIO_WAITN_PENDING) {
789 aiop->aio_flags &= ~AIO_WAITN_PENDING;
790 cv_signal(&aiop->aio_waitncv);
791 }
792 mutex_exit(&aiop->aio_mutex);
793
794 return (error);
795 }
796
797 /*
798 * aio_unlock_requests
799  * copies out the result of each request as well as the return value.
800  * It builds the list of completed asynchronous requests,
801  * unlocks the allocated memory ranges and
802  * puts the aio request structures back onto the free list.
803 */
804
805 static int
806 aio_unlock_requests(
807 caddr_t iocblist,
808 int iocb_index,
809 aio_req_t *reqlist,
810 aio_t *aiop,
811 model_t model)
812 {
813 aio_req_t *reqp, *nreqp;
814
815 if (model == DATAMODEL_NATIVE) {
816 for (reqp = reqlist; reqp != NULL; reqp = nreqp) {
817 (((caddr_t *)iocblist)[iocb_index++]) =
818 reqp->aio_req_iocb.iocb;
819 nreqp = reqp->aio_req_next;
820 aphysio_unlock(reqp);
821 aio_copyout_result(reqp);
822 mutex_enter(&aiop->aio_mutex);
823 aio_req_free(aiop, reqp);
824 mutex_exit(&aiop->aio_mutex);
825 }
826 }
827 #ifdef _SYSCALL32_IMPL
828 else {
829 for (reqp = reqlist; reqp != NULL; reqp = nreqp) {
830 ((caddr32_t *)iocblist)[iocb_index++] =
831 reqp->aio_req_iocb.iocb32;
832 nreqp = reqp->aio_req_next;
833 aphysio_unlock(reqp);
834 aio_copyout_result(reqp);
835 mutex_enter(&aiop->aio_mutex);
836 aio_req_free(aiop, reqp);
837 mutex_exit(&aiop->aio_mutex);
838 }
839 }
840 #endif /* _SYSCALL32_IMPL */
841 return (iocb_index);
842 }
843
844 /*
845 * aio_reqlist_concat
846 * moves "max" elements from the done queue to the reqlist queue and removes
847 * the AIO_DONEQ flag.
848 * - reqlist queue is a simple linked list
849  * - done queue is a doubly linked list
850 */
851
852 static int
853 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
854 {
855 aio_req_t *q2, *q2work, *list;
856 int count = 0;
857
858 list = *reqlist;
859 q2 = aiop->aio_doneq;
860 q2work = q2;
861 while (max-- > 0) {
862 q2work->aio_req_flags &= ~AIO_DONEQ;
863 q2work = q2work->aio_req_next;
864 count++;
865 if (q2work == q2)
866 break;
867 }
868
869 if (q2work == q2) {
870 		/* all elements visited */
871 q2->aio_req_prev->aio_req_next = list;
872 list = q2;
873 aiop->aio_doneq = NULL;
874 } else {
875 /*
876 		 * max < number of elements in the doneq;
877 		 * detach only the required number of elements
878 		 * from the doneq.
879 */
880 q2work->aio_req_prev->aio_req_next = list;
881 list = q2;
882
883 aiop->aio_doneq = q2work;
884 q2work->aio_req_prev = q2->aio_req_prev;
885 q2->aio_req_prev->aio_req_next = q2work;
886 }
887 *reqlist = list;
888 return (count);
889 }
890
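/*
 * aiosuspend() blocks until at least one of the requests named by the
 * user's aiocb list has completed, a signal is delivered, or the
 * timeout expires.  Completed requests found on the done queue are
 * unlocked, their results are copied out, and their aio_req_t's freed.
 */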
891 /*ARGSUSED*/
892 static int
893 aiosuspend(
894 void *aiocb,
895 int nent,
896 struct timespec *timout,
897 int flag,
898 long *rval,
899 int run_mode)
900 {
901 int error;
902 aio_t *aiop;
903 aio_req_t *reqp, *found, *next;
904 caddr_t cbplist = NULL;
905 aiocb_t *cbp, **ucbp;
906 #ifdef _SYSCALL32_IMPL
907 aiocb32_t *cbp32;
908 caddr32_t *ucbp32;
909 #endif /* _SYSCALL32_IMPL */
910 aiocb64_32_t *cbp64;
911 int rv;
912 int i;
913 size_t ssize;
914 model_t model = get_udatamodel();
915 int blocking;
916 int timecheck;
917 timestruc_t rqtime;
918 timestruc_t *rqtp;
919
920 aiop = curproc->p_aio;
921 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
922 return (EINVAL);
923
924 /*
925 * Establish the absolute future time for the timeout.
926 */
927 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
928 if (error)
929 return (error);
930 if (rqtp) {
931 timestruc_t now;
932 timecheck = timechanged;
933 gethrestime(&now);
934 timespecadd(rqtp, &now);
935 }
936
937 /*
938 	 * If we are not blocking and there is no completed I/O,
939 	 * skip the aiocb copyin.
940 */
941 if (!blocking && (aiop->aio_pollq == NULL) &&
942 (aiop->aio_doneq == NULL)) {
943 return (EAGAIN);
944 }
945
946 if (model == DATAMODEL_NATIVE)
947 ssize = (sizeof (aiocb_t *) * nent);
948 #ifdef _SYSCALL32_IMPL
949 else
950 ssize = (sizeof (caddr32_t) * nent);
951 #endif /* _SYSCALL32_IMPL */
952
953 cbplist = kmem_alloc(ssize, KM_NOSLEEP);
954 if (cbplist == NULL)
955 return (ENOMEM);
956
957 if (copyin(aiocb, cbplist, ssize)) {
958 error = EFAULT;
959 goto done;
960 }
961
962 found = NULL;
963 /*
964 * we need to get the aio_cleanupq_mutex since we call
965 * aio_req_done().
966 */
967 mutex_enter(&aiop->aio_cleanupq_mutex);
968 mutex_enter(&aiop->aio_mutex);
969 for (;;) {
970 /* push requests on poll queue to done queue */
971 if (aiop->aio_pollq) {
972 mutex_exit(&aiop->aio_mutex);
973 mutex_exit(&aiop->aio_cleanupq_mutex);
974 aio_cleanup(0);
975 mutex_enter(&aiop->aio_cleanupq_mutex);
976 mutex_enter(&aiop->aio_mutex);
977 }
978 /* check for requests on done queue */
979 if (aiop->aio_doneq) {
980 if (model == DATAMODEL_NATIVE)
981 ucbp = (aiocb_t **)cbplist;
982 #ifdef _SYSCALL32_IMPL
983 else
984 ucbp32 = (caddr32_t *)cbplist;
985 #endif /* _SYSCALL32_IMPL */
986 for (i = 0; i < nent; i++) {
987 if (model == DATAMODEL_NATIVE) {
988 if ((cbp = *ucbp++) == NULL)
989 continue;
990 if (run_mode != AIO_LARGEFILE)
991 reqp = aio_req_done(
992 &cbp->aio_resultp);
993 else {
994 cbp64 = (aiocb64_32_t *)cbp;
995 reqp = aio_req_done(
996 &cbp64->aio_resultp);
997 }
998 }
999 #ifdef _SYSCALL32_IMPL
1000 else {
1001 if (run_mode == AIO_32) {
1002 if ((cbp32 =
1003 (aiocb32_t *)(uintptr_t)
1004 *ucbp32++) == NULL)
1005 continue;
1006 reqp = aio_req_done(
1007 &cbp32->aio_resultp);
1008 } else if (run_mode == AIO_LARGEFILE) {
1009 if ((cbp64 =
1010 (aiocb64_32_t *)(uintptr_t)
1011 *ucbp32++) == NULL)
1012 continue;
1013 reqp = aio_req_done(
1014 &cbp64->aio_resultp);
1015 }
1016
1017 }
1018 #endif /* _SYSCALL32_IMPL */
1019 if (reqp) {
1020 reqp->aio_req_next = found;
1021 found = reqp;
1022 }
1023 if (aiop->aio_doneq == NULL)
1024 break;
1025 }
1026 if (found)
1027 break;
1028 }
1029 if (aiop->aio_notifycnt > 0) {
1030 /*
1031 * nothing on the kernel's queue. the user
1032 * has notified the kernel that it has items
1033 * on a user-level queue.
1034 */
1035 aiop->aio_notifycnt--;
1036 *rval = 1;
1037 error = 0;
1038 break;
1039 }
1040 /* don't block if nothing is outstanding */
1041 if (aiop->aio_outstanding == 0) {
1042 error = EAGAIN;
1043 break;
1044 }
1045 if (blocking) {
1046 /*
1047 * drop the aio_cleanupq_mutex as we are
1048 * going to block.
1049 */
1050 mutex_exit(&aiop->aio_cleanupq_mutex);
1051 rv = cv_waituntil_sig(&aiop->aio_waitcv,
1052 &aiop->aio_mutex, rqtp, timecheck);
1053 /*
1054 * we have to drop aio_mutex and
1055 * grab it in the right order.
1056 */
1057 mutex_exit(&aiop->aio_mutex);
1058 mutex_enter(&aiop->aio_cleanupq_mutex);
1059 mutex_enter(&aiop->aio_mutex);
1060 if (rv > 0) /* check done queue again */
1061 continue;
1062 if (rv == 0) /* interrupted by a signal */
1063 error = EINTR;
1064 else /* timer expired */
1065 error = ETIME;
1066 } else {
1067 error = EAGAIN;
1068 }
1069 break;
1070 }
1071 mutex_exit(&aiop->aio_mutex);
1072 mutex_exit(&aiop->aio_cleanupq_mutex);
1073 for (reqp = found; reqp != NULL; reqp = next) {
1074 next = reqp->aio_req_next;
1075 aphysio_unlock(reqp);
1076 aio_copyout_result(reqp);
1077 mutex_enter(&aiop->aio_mutex);
1078 aio_req_free(aiop, reqp);
1079 mutex_exit(&aiop->aio_mutex);
1080 }
1081 done:
1082 kmem_free(cbplist, ssize);
1083 return (error);
1084 }
1085
1086 /*
1087 * initialize aio by allocating an aio_t struct for this
1088 * process.
1089 */
1090 static int
1091 aioinit(void)
1092 {
1093 proc_t *p = curproc;
1094 aio_t *aiop;
1095 mutex_enter(&p->p_lock);
1096 if ((aiop = p->p_aio) == NULL) {
1097 aiop = aio_aiop_alloc();
1098 p->p_aio = aiop;
1099 }
1100 mutex_exit(&p->p_lock);
1101 if (aiop == NULL)
1102 return (ENOMEM);
1103 return (0);
1104 }
1105
1106 /*
1107 * start a special thread that will cleanup after aio requests
1108 * that are preventing a segment from being unmapped. as_unmap()
1109  * blocks until all physio to this segment is completed. this
1110 * doesn't happen until all the pages in this segment are not
1111 * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
1112 * requests still outstanding. this special thread will make sure
1113 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
1114 *
1115 * this function will return an error if the process has only
1116 * one LWP. the assumption is that the caller is a separate LWP
1117 * that remains blocked in the kernel for the life of this process.
1118 */
1119 static int
1120 aiostart(void)
1121 {
1122 proc_t *p = curproc;
1123 aio_t *aiop;
1124 int first, error = 0;
1125
1126 if (p->p_lwpcnt == 1)
1127 return (EDEADLK);
1128 mutex_enter(&p->p_lock);
1129 if ((aiop = p->p_aio) == NULL)
1130 error = EINVAL;
1131 else {
1132 first = aiop->aio_ok;
1133 if (aiop->aio_ok == 0)
1134 aiop->aio_ok = 1;
1135 }
1136 mutex_exit(&p->p_lock);
1137 if (error == 0 && first == 0) {
1138 return (aio_cleanup_thread(aiop));
1139 /* should return only to exit */
1140 }
1141 return (error);
1142 }
1143
1144 /*
1145 * Associate an aiocb with a port.
1146 * This function is used by aiorw() to associate a transaction with a port.
1147 * Allocate an event port structure (port_alloc_event()) and store the
1148 * delivered user pointer (portnfy_user) in the portkev_user field of the
1149  * port_kevent_t structure.
1150 * The aio_req_portkev pointer in the aio_req_t structure was added to identify
1151 * the port association.
1152 */
1153
1154 static int
1155 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
1156 aio_req_t *reqp, int event)
1157 {
1158 port_kevent_t *pkevp = NULL;
1159 int error;
1160
1161 error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
1162 PORT_SOURCE_AIO, &pkevp);
1163 if (error) {
1164 if ((error == ENOMEM) || (error == EAGAIN))
1165 error = EAGAIN;
1166 else
1167 error = EINVAL;
1168 } else {
1169 port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
1170 aio_port_callback, reqp);
1171 pkevp->portkev_events = event;
1172 reqp->aio_req_portkev = pkevp;
1173 reqp->aio_req_port = pntfy->portnfy_port;
1174 }
1175 return (error);
1176 }
1177
1178 #ifdef _LP64
1179
1180 /*
1181  * Asynchronous list IO. A chain of aiocb's is copied in
1182 * one at a time. If the aiocb is invalid, it is skipped.
1183 * For each aiocb, the appropriate driver entry point is
1184 * called. Optimize for the common case where the list
1185 * of requests is to the same file descriptor.
1186 *
1187 * One possible optimization is to define a new driver entry
1188 * point that supports a list of IO requests. Whether this
1189 * improves performance depends somewhat on the driver's
1190 * locking strategy. Processing a list could adversely impact
1191 * the driver's interrupt latency.
1192 */
1193 static int
1194 alio(
1195 int mode_arg,
1196 aiocb_t **aiocb_arg,
1197 int nent,
1198 struct sigevent *sigev)
1199 {
1200 file_t *fp;
1201 file_t *prev_fp = NULL;
1202 int prev_mode = -1;
1203 struct vnode *vp;
1204 aio_lio_t *head;
1205 aio_req_t *reqp;
1206 aio_t *aiop;
1207 caddr_t cbplist;
1208 aiocb_t cb;
1209 aiocb_t *aiocb = &cb;
1210 aiocb_t *cbp;
1211 aiocb_t **ucbp;
1212 struct sigevent sigevk;
1213 sigqueue_t *sqp;
1214 int (*aio_func)();
1215 int mode;
1216 int error = 0;
1217 int aio_errors = 0;
1218 int i;
1219 size_t ssize;
1220 int deadhead = 0;
1221 int aio_notsupported = 0;
1222 int lio_head_port;
1223 int aio_port;
1224 int aio_thread;
1225 port_kevent_t *pkevtp = NULL;
1226 int portused = 0;
1227 port_notify_t pnotify;
1228 int event;
1229
1230 aiop = curproc->p_aio;
1231 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1232 return (EINVAL);
1233
1234 ssize = (sizeof (aiocb_t *) * nent);
1235 cbplist = kmem_alloc(ssize, KM_SLEEP);
1236 ucbp = (aiocb_t **)cbplist;
1237
1238 if (copyin(aiocb_arg, cbplist, ssize) ||
1239 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
1240 kmem_free(cbplist, ssize);
1241 return (EFAULT);
1242 }
1243
1244 /* Event Ports */
1245 if (sigev &&
1246 (sigevk.sigev_notify == SIGEV_THREAD ||
1247 sigevk.sigev_notify == SIGEV_PORT)) {
1248 if (sigevk.sigev_notify == SIGEV_THREAD) {
1249 pnotify.portnfy_port = sigevk.sigev_signo;
1250 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
1251 } else if (copyin(sigevk.sigev_value.sival_ptr,
1252 &pnotify, sizeof (pnotify))) {
1253 kmem_free(cbplist, ssize);
1254 return (EFAULT);
1255 }
1256 error = port_alloc_event(pnotify.portnfy_port,
1257 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
1258 if (error) {
1259 if (error == ENOMEM || error == EAGAIN)
1260 error = EAGAIN;
1261 else
1262 error = EINVAL;
1263 kmem_free(cbplist, ssize);
1264 return (error);
1265 }
1266 lio_head_port = pnotify.portnfy_port;
1267 portused = 1;
1268 }
1269
1270 /*
1271 * a list head should be allocated if notification is
1272 * enabled for this list.
1273 */
1274 head = NULL;
1275
1276 if (mode_arg == LIO_WAIT || sigev) {
1277 mutex_enter(&aiop->aio_mutex);
1278 error = aio_lio_alloc(&head);
1279 mutex_exit(&aiop->aio_mutex);
1280 if (error)
1281 goto done;
1282 deadhead = 1;
1283 head->lio_nent = nent;
1284 head->lio_refcnt = nent;
1285 head->lio_port = -1;
1286 head->lio_portkev = NULL;
1287 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
1288 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
1289 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
1290 if (sqp == NULL) {
1291 error = EAGAIN;
1292 goto done;
1293 }
1294 sqp->sq_func = NULL;
1295 sqp->sq_next = NULL;
1296 sqp->sq_info.si_code = SI_ASYNCIO;
1297 sqp->sq_info.si_pid = curproc->p_pid;
1298 sqp->sq_info.si_ctid = PRCTID(curproc);
1299 sqp->sq_info.si_zoneid = getzoneid();
1300 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
1301 sqp->sq_info.si_signo = sigevk.sigev_signo;
1302 sqp->sq_info.si_value = sigevk.sigev_value;
1303 head->lio_sigqp = sqp;
1304 } else {
1305 head->lio_sigqp = NULL;
1306 }
1307 if (pkevtp) {
1308 /*
1309 * Prepare data to send when list of aiocb's
1310 * has completed.
1311 */
1312 port_init_event(pkevtp, (uintptr_t)sigev,
1313 (void *)(uintptr_t)pnotify.portnfy_user,
1314 NULL, head);
1315 pkevtp->portkev_events = AIOLIO;
1316 head->lio_portkev = pkevtp;
1317 head->lio_port = pnotify.portnfy_port;
1318 }
1319 }
1320
1321 for (i = 0; i < nent; i++, ucbp++) {
1322
1323 cbp = *ucbp;
1324 /* skip entry if it can't be copied. */
1325 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
1326 if (head) {
1327 mutex_enter(&aiop->aio_mutex);
1328 head->lio_nent--;
1329 head->lio_refcnt--;
1330 mutex_exit(&aiop->aio_mutex);
1331 }
1332 continue;
1333 }
1334
1335 /* skip if opcode for aiocb is LIO_NOP */
1336 mode = aiocb->aio_lio_opcode;
1337 if (mode == LIO_NOP) {
1338 cbp = NULL;
1339 if (head) {
1340 mutex_enter(&aiop->aio_mutex);
1341 head->lio_nent--;
1342 head->lio_refcnt--;
1343 mutex_exit(&aiop->aio_mutex);
1344 }
1345 continue;
1346 }
1347
1348 /* increment file descriptor's ref count. */
1349 if ((fp = getf(aiocb->aio_fildes)) == NULL) {
1350 lio_set_uerror(&cbp->aio_resultp, EBADF);
1351 if (head) {
1352 mutex_enter(&aiop->aio_mutex);
1353 head->lio_nent--;
1354 head->lio_refcnt--;
1355 mutex_exit(&aiop->aio_mutex);
1356 }
1357 aio_errors++;
1358 continue;
1359 }
1360
1361 /*
1362 * check the permission of the partition
1363 */
1364 if ((fp->f_flag & mode) == 0) {
1365 releasef(aiocb->aio_fildes);
1366 lio_set_uerror(&cbp->aio_resultp, EBADF);
1367 if (head) {
1368 mutex_enter(&aiop->aio_mutex);
1369 head->lio_nent--;
1370 head->lio_refcnt--;
1371 mutex_exit(&aiop->aio_mutex);
1372 }
1373 aio_errors++;
1374 continue;
1375 }
1376
1377 /*
1378 * common case where requests are to the same fd
1379 * for the same r/w operation.
1380 * for UFS, need to set EBADFD
1381 */
1382 vp = fp->f_vnode;
1383 if (fp != prev_fp || mode != prev_mode) {
1384 aio_func = check_vp(vp, mode);
1385 if (aio_func == NULL) {
1386 prev_fp = NULL;
1387 releasef(aiocb->aio_fildes);
1388 lio_set_uerror(&cbp->aio_resultp, EBADFD);
1389 aio_notsupported++;
1390 if (head) {
1391 mutex_enter(&aiop->aio_mutex);
1392 head->lio_nent--;
1393 head->lio_refcnt--;
1394 mutex_exit(&aiop->aio_mutex);
1395 }
1396 continue;
1397 } else {
1398 prev_fp = fp;
1399 prev_mode = mode;
1400 }
1401 }
1402
1403 error = aio_req_setup(&reqp, aiop, aiocb,
1404 &cbp->aio_resultp, vp, 0);
1405 if (error) {
1406 releasef(aiocb->aio_fildes);
1407 lio_set_uerror(&cbp->aio_resultp, error);
1408 if (head) {
1409 mutex_enter(&aiop->aio_mutex);
1410 head->lio_nent--;
1411 head->lio_refcnt--;
1412 mutex_exit(&aiop->aio_mutex);
1413 }
1414 aio_errors++;
1415 continue;
1416 }
1417
1418 reqp->aio_req_lio = head;
1419 deadhead = 0;
1420
1421 /*
1422 * Set the errno field now before sending the request to
1423 * the driver to avoid a race condition
1424 */
1425 (void) suword32(&cbp->aio_resultp.aio_errno,
1426 EINPROGRESS);
1427
1428 reqp->aio_req_iocb.iocb = (caddr_t)cbp;
1429
1430 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
1431 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
1432 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
1433 if (aio_port | aio_thread) {
1434 port_kevent_t *lpkevp;
1435 /*
1436 * Prepare data to send with each aiocb completed.
1437 */
1438 if (aio_port) {
1439 void *paddr =
1440 aiocb->aio_sigevent.sigev_value.sival_ptr;
1441 if (copyin(paddr, &pnotify, sizeof (pnotify)))
1442 error = EFAULT;
1443 } else { /* aio_thread */
1444 pnotify.portnfy_port =
1445 aiocb->aio_sigevent.sigev_signo;
1446 pnotify.portnfy_user =
1447 aiocb->aio_sigevent.sigev_value.sival_ptr;
1448 }
1449 if (error)
1450 /* EMPTY */;
1451 else if (pkevtp != NULL &&
1452 pnotify.portnfy_port == lio_head_port)
1453 error = port_dup_event(pkevtp, &lpkevp,
1454 PORT_ALLOC_DEFAULT);
1455 else
1456 error = port_alloc_event(pnotify.portnfy_port,
1457 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
1458 &lpkevp);
1459 if (error == 0) {
1460 port_init_event(lpkevp, (uintptr_t)cbp,
1461 (void *)(uintptr_t)pnotify.portnfy_user,
1462 aio_port_callback, reqp);
1463 lpkevp->portkev_events = event;
1464 reqp->aio_req_portkev = lpkevp;
1465 reqp->aio_req_port = pnotify.portnfy_port;
1466 }
1467 }
1468
1469 /*
1470 * send the request to driver.
1471 */
1472 if (error == 0) {
1473 if (aiocb->aio_nbytes == 0) {
1474 clear_active_fd(aiocb->aio_fildes);
1475 aio_zerolen(reqp);
1476 continue;
1477 }
1478 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
1479 CRED());
1480 }
1481
1482 /*
1483 * the fd's ref count is not decremented until the IO has
1484 * completed unless there was an error.
1485 */
1486 if (error) {
1487 releasef(aiocb->aio_fildes);
1488 lio_set_uerror(&cbp->aio_resultp, error);
1489 if (head) {
1490 mutex_enter(&aiop->aio_mutex);
1491 head->lio_nent--;
1492 head->lio_refcnt--;
1493 mutex_exit(&aiop->aio_mutex);
1494 }
1495 if (error == ENOTSUP)
1496 aio_notsupported++;
1497 else
1498 aio_errors++;
1499 lio_set_error(reqp, portused);
1500 } else {
1501 clear_active_fd(aiocb->aio_fildes);
1502 }
1503 }
1504
1505 if (aio_notsupported) {
1506 error = ENOTSUP;
1507 } else if (aio_errors) {
1508 /*
1509 * return EIO if any request failed
1510 */
1511 error = EIO;
1512 }
1513
1514 if (mode_arg == LIO_WAIT) {
1515 mutex_enter(&aiop->aio_mutex);
1516 while (head->lio_refcnt > 0) {
1517 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1518 mutex_exit(&aiop->aio_mutex);
1519 error = EINTR;
1520 goto done;
1521 }
1522 }
1523 mutex_exit(&aiop->aio_mutex);
1524 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
1525 }
1526
1527 done:
1528 kmem_free(cbplist, ssize);
1529 if (deadhead) {
1530 if (head->lio_sigqp)
1531 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
1532 if (head->lio_portkev)
1533 port_free_event(head->lio_portkev);
1534 kmem_free(head, sizeof (aio_lio_t));
1535 }
1536 return (error);
1537 }
1538
1539 #endif /* _LP64 */
1540
1541 /*
1542 * Asynchronous list IO.
1543 * If list I/O is called with LIO_WAIT it can still return
1544 * before all the I/O's are completed if a signal is caught
1545  * or if the list includes UFS I/O requests. If this happens,
1546  * libaio will call aliowait() to wait for the I/O's to
1547  * complete.
1548 */
1549 /*ARGSUSED*/
1550 static int
1551 aliowait(
1552 int mode,
1553 void *aiocb,
1554 int nent,
1555 void *sigev,
1556 int run_mode)
1557 {
1558 aio_lio_t *head;
1559 aio_t *aiop;
1560 caddr_t cbplist;
1561 aiocb_t *cbp, **ucbp;
1562 #ifdef _SYSCALL32_IMPL
1563 aiocb32_t *cbp32;
1564 caddr32_t *ucbp32;
1565 aiocb64_32_t *cbp64;
1566 #endif
1567 int error = 0;
1568 int i;
1569 size_t ssize = 0;
1570 model_t model = get_udatamodel();
1571
1572 aiop = curproc->p_aio;
1573 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1574 return (EINVAL);
1575
1576 if (model == DATAMODEL_NATIVE)
1577 ssize = (sizeof (aiocb_t *) * nent);
1578 #ifdef _SYSCALL32_IMPL
1579 else
1580 ssize = (sizeof (caddr32_t) * nent);
1581 #endif /* _SYSCALL32_IMPL */
1582
1583 if (ssize == 0)
1584 return (EINVAL);
1585
1586 cbplist = kmem_alloc(ssize, KM_SLEEP);
1587
1588 if (model == DATAMODEL_NATIVE)
1589 ucbp = (aiocb_t **)cbplist;
1590 #ifdef _SYSCALL32_IMPL
1591 else
1592 ucbp32 = (caddr32_t *)cbplist;
1593 #endif /* _SYSCALL32_IMPL */
1594
1595 if (copyin(aiocb, cbplist, ssize)) {
1596 error = EFAULT;
1597 goto done;
1598 }
1599
1600 /*
1601 * To find the list head, we go through the
1602 * list of aiocb structs, find the request
1603 	 * it is for, then get the list head that reqp
1604 	 * points to.
1605 */
1606 head = NULL;
1607
1608 for (i = 0; i < nent; i++) {
1609 if (model == DATAMODEL_NATIVE) {
1610 /*
1611 			 * Since we are only checking for a NULL pointer, the
1612 			 * following works for both native data sizes as well
1613 			 * as for the largefile aiocb.
1614 */
1615 if ((cbp = *ucbp++) == NULL)
1616 continue;
1617 if (run_mode != AIO_LARGEFILE)
1618 if (head = aio_list_get(&cbp->aio_resultp))
1619 break;
1620 else {
1621 /*
1622 				 * This is the case when a largefile call is
1623 				 * made on a 32 bit kernel.
1624 				 * Treat each pointer as a pointer to an
1625 				 * aiocb64_32.
1626 */
1627 if (head = aio_list_get((aio_result_t *)
1628 &(((aiocb64_32_t *)cbp)->aio_resultp)))
1629 break;
1630 }
1631 }
1632 #ifdef _SYSCALL32_IMPL
1633 else {
1634 if (run_mode == AIO_LARGEFILE) {
1635 if ((cbp64 = (aiocb64_32_t *)
1636 (uintptr_t)*ucbp32++) == NULL)
1637 continue;
1638 if (head = aio_list_get((aio_result_t *)
1639 &cbp64->aio_resultp))
1640 break;
1641 } else if (run_mode == AIO_32) {
1642 if ((cbp32 = (aiocb32_t *)
1643 (uintptr_t)*ucbp32++) == NULL)
1644 continue;
1645 if (head = aio_list_get((aio_result_t *)
1646 &cbp32->aio_resultp))
1647 break;
1648 }
1649 }
1650 #endif /* _SYSCALL32_IMPL */
1651 }
1652
1653 if (head == NULL) {
1654 error = EINVAL;
1655 goto done;
1656 }
1657
1658 mutex_enter(&aiop->aio_mutex);
1659 while (head->lio_refcnt > 0) {
1660 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1661 mutex_exit(&aiop->aio_mutex);
1662 error = EINTR;
1663 goto done;
1664 }
1665 }
1666 mutex_exit(&aiop->aio_mutex);
1667 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
1668 done:
1669 kmem_free(cbplist, ssize);
1670 return (error);
1671 }
1672
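/*
 * Look up the request hashed on the given result pointer and return
 * the list head (aio_lio_t) it belongs to, or NULL if none is found.
 */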
1673 aio_lio_t *
1674 aio_list_get(aio_result_t *resultp)
1675 {
1676 aio_lio_t *head = NULL;
1677 aio_t *aiop;
1678 aio_req_t **bucket;
1679 aio_req_t *reqp;
1680 long index;
1681
1682 aiop = curproc->p_aio;
1683 if (aiop == NULL)
1684 return (NULL);
1685
1686 if (resultp) {
1687 index = AIO_HASH(resultp);
1688 bucket = &aiop->aio_hash[index];
1689 for (reqp = *bucket; reqp != NULL;
1690 reqp = reqp->aio_hash_next) {
1691 if (reqp->aio_req_resultp == resultp) {
1692 head = reqp->aio_req_lio;
1693 return (head);
1694 }
1695 }
1696 }
1697 return (NULL);
1698 }
1699
1700
1701 static void
1702 lio_set_uerror(void *resultp, int error)
1703 {
1704 /*
1705 * the resultp field is a pointer to where the
1706 * error should be written out to the user's
1707 * aiocb.
1708 *
1709 */
1710 if (get_udatamodel() == DATAMODEL_NATIVE) {
1711 (void) sulword(&((aio_result_t *)resultp)->aio_return,
1712 (ssize_t)-1);
1713 (void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1714 }
1715 #ifdef _SYSCALL32_IMPL
1716 else {
1717 (void) suword32(&((aio_result32_t *)resultp)->aio_return,
1718 (uint_t)-1);
1719 (void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1720 }
1721 #endif /* _SYSCALL32_IMPL */
1722 }
1723
1724 /*
1725 * do cleanup completion for all requests in list. memory for
1726 * each request is also freed.
1727 */
1728 static void
1729 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
1730 {
1731 int i;
1732 aio_req_t *reqp;
1733 aio_result_t *resultp;
1734 aiocb64_32_t *aiocb_64;
1735
1736 for (i = 0; i < nent; i++) {
1737 if (get_udatamodel() == DATAMODEL_NATIVE) {
1738 if (cbp[i] == NULL)
1739 continue;
1740 if (run_mode == AIO_LARGEFILE) {
1741 aiocb_64 = (aiocb64_32_t *)cbp[i];
1742 resultp = (aio_result_t *)
1743 &aiocb_64->aio_resultp;
1744 } else
1745 resultp = &cbp[i]->aio_resultp;
1746 }
1747 #ifdef _SYSCALL32_IMPL
1748 else {
1749 aiocb32_t *aiocb_32;
1750 caddr32_t *cbp32;
1751
1752 cbp32 = (caddr32_t *)cbp;
1753 if (cbp32[i] == NULL)
1754 continue;
1755 if (run_mode == AIO_32) {
1756 aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
1757 resultp = (aio_result_t *)&aiocb_32->
1758 aio_resultp;
1759 } else if (run_mode == AIO_LARGEFILE) {
1760 aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
1761 resultp = (aio_result_t *)&aiocb_64->
1762 aio_resultp;
1763 }
1764 }
1765 #endif /* _SYSCALL32_IMPL */
1766 /*
1767 * we need to get the aio_cleanupq_mutex since we call
1768 * aio_req_done().
1769 */
1770 mutex_enter(&aiop->aio_cleanupq_mutex);
1771 mutex_enter(&aiop->aio_mutex);
1772 reqp = aio_req_done(resultp);
1773 mutex_exit(&aiop->aio_mutex);
1774 mutex_exit(&aiop->aio_cleanupq_mutex);
1775 if (reqp != NULL) {
1776 aphysio_unlock(reqp);
1777 aio_copyout_result(reqp);
1778 mutex_enter(&aiop->aio_mutex);
1779 aio_req_free(aiop, reqp);
1780 mutex_exit(&aiop->aio_mutex);
1781 }
1782 }
1783 }
1784
1785 /*
1786 * Write out the results for an aio request that is done.
1787 */
1788 static int
1789 aioerror(void *cb, int run_mode)
1790 {
1791 aio_result_t *resultp;
1792 aio_t *aiop;
1793 aio_req_t *reqp;
1794 int retval;
1795
1796 aiop = curproc->p_aio;
1797 if (aiop == NULL || cb == NULL)
1798 return (EINVAL);
1799
1800 if (get_udatamodel() == DATAMODEL_NATIVE) {
1801 if (run_mode == AIO_LARGEFILE)
1802 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1803 aio_resultp;
1804 else
1805 resultp = &((aiocb_t *)cb)->aio_resultp;
1806 }
1807 #ifdef _SYSCALL32_IMPL
1808 else {
1809 if (run_mode == AIO_LARGEFILE)
1810 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1811 aio_resultp;
1812 else if (run_mode == AIO_32)
1813 resultp = (aio_result_t *)&((aiocb32_t *)cb)->
1814 aio_resultp;
1815 }
1816 #endif /* _SYSCALL32_IMPL */
1817 /*
1818 * we need to get the aio_cleanupq_mutex since we call
1819 * aio_req_find().
1820 */
1821 mutex_enter(&aiop->aio_cleanupq_mutex);
1822 mutex_enter(&aiop->aio_mutex);
1823 retval = aio_req_find(resultp, &reqp);
1824 mutex_exit(&aiop->aio_mutex);
1825 mutex_exit(&aiop->aio_cleanupq_mutex);
1826 if (retval == 0) {
1827 aphysio_unlock(reqp);
1828 aio_copyout_result(reqp);
1829 mutex_enter(&aiop->aio_mutex);
1830 aio_req_free(aiop, reqp);
1831 mutex_exit(&aiop->aio_mutex);
1832 return (0);
1833 } else if (retval == 1)
1834 return (EINPROGRESS);
1835 else if (retval == 2)
1836 return (EINVAL);
1837 return (0);
1838 }
1839
1840 /*
1841 * aio_cancel - if no requests outstanding,
1842 * return AIO_ALLDONE
1843 * else
1844 * return AIO_NOTCANCELED
1845 */
1846 static int
1847 aio_cancel(
1848 int fildes,
1849 void *cb,
1850 long *rval,
1851 int run_mode)
1852 {
1853 aio_t *aiop;
1854 void *resultp;
1855 int index;
1856 aio_req_t **bucket;
1857 aio_req_t *ent;
1858
1859
1860 /*
1861 * Verify valid file descriptor
1862 */
1863 if ((getf(fildes)) == NULL) {
1864 return (EBADF);
1865 }
1866 releasef(fildes);
1867
1868 aiop = curproc->p_aio;
1869 if (aiop == NULL)
1870 return (EINVAL);
1871
1872 if (aiop->aio_outstanding == 0) {
1873 *rval = AIO_ALLDONE;
1874 return (0);
1875 }
1876
1877 mutex_enter(&aiop->aio_mutex);
1878 if (cb != NULL) {
1879 if (get_udatamodel() == DATAMODEL_NATIVE) {
1880 if (run_mode == AIO_LARGEFILE)
1881 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1882 ->aio_resultp;
1883 else
1884 resultp = &((aiocb_t *)cb)->aio_resultp;
1885 }
1886 #ifdef _SYSCALL32_IMPL
1887 else {
1888 if (run_mode == AIO_LARGEFILE)
1889 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1890 ->aio_resultp;
1891 else if (run_mode == AIO_32)
1892 resultp = (aio_result_t *)&((aiocb32_t *)cb)
1893 ->aio_resultp;
1894 }
1895 #endif /* _SYSCALL32_IMPL */
1896 index = AIO_HASH(resultp);
1897 bucket = &aiop->aio_hash[index];
1898 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1899 if (ent->aio_req_resultp == resultp) {
1900 if ((ent->aio_req_flags & AIO_PENDING) == 0) {
1901 mutex_exit(&aiop->aio_mutex);
1902 *rval = AIO_ALLDONE;
1903 return (0);
1904 }
1905 mutex_exit(&aiop->aio_mutex);
1906 *rval = AIO_NOTCANCELED;
1907 return (0);
1908 }
1909 }
1910 mutex_exit(&aiop->aio_mutex);
1911 *rval = AIO_ALLDONE;
1912 return (0);
1913 }
1914
1915 for (index = 0; index < AIO_HASHSZ; index++) {
1916 bucket = &aiop->aio_hash[index];
1917 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1918 if (ent->aio_req_fd == fildes) {
1919 if ((ent->aio_req_flags & AIO_PENDING) != 0) {
1920 mutex_exit(&aiop->aio_mutex);
1921 *rval = AIO_NOTCANCELED;
1922 return (0);
1923 }
1924 }
1925 }
1926 }
1927 mutex_exit(&aiop->aio_mutex);
1928 *rval = AIO_ALLDONE;
1929 return (0);
1930 }
1931
1932 /*
1933 * solaris version of asynchronous read and write
1934 */
1935 static int
1936 arw(
1937 int opcode,
1938 int fdes,
1939 char *bufp,
1940 int bufsize,
1941 offset_t offset,
1942 aio_result_t *resultp,
1943 int mode)
1944 {
1945 file_t *fp;
1946 int error;
1947 struct vnode *vp;
1948 aio_req_t *reqp;
1949 aio_t *aiop;
1950 int (*aio_func)();
1951 #ifdef _LP64
1952 aiocb_t aiocb;
1953 #else
1954 aiocb64_32_t aiocb64;
1955 #endif
1956
1957 aiop = curproc->p_aio;
1958 if (aiop == NULL)
1959 return (EINVAL);
1960
1961 if ((fp = getf(fdes)) == NULL) {
1962 return (EBADF);
1963 }
1964
1965 /*
1966 * check the permission of the partition
1967 */
1968 if ((fp->f_flag & mode) == 0) {
1969 releasef(fdes);
1970 return (EBADF);
1971 }
1972
1973 vp = fp->f_vnode;
1974 aio_func = check_vp(vp, mode);
1975 if (aio_func == NULL) {
1976 releasef(fdes);
1977 return (EBADFD);
1978 }
1979 #ifdef _LP64
1980 aiocb.aio_fildes = fdes;
1981 aiocb.aio_buf = bufp;
1982 aiocb.aio_nbytes = bufsize;
1983 aiocb.aio_offset = offset;
1984 aiocb.aio_sigevent.sigev_notify = 0;
1985 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1);
1986 #else
1987 aiocb64.aio_fildes = fdes;
1988 aiocb64.aio_buf = (caddr32_t)bufp;
1989 aiocb64.aio_nbytes = bufsize;
1990 aiocb64.aio_offset = offset;
1991 aiocb64.aio_sigevent.sigev_notify = 0;
1992 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1);
1993 #endif
1994 if (error) {
1995 releasef(fdes);
1996 return (error);
1997 }
1998
1999 /*
2000 * enable polling on this request if the opcode has
2001 * the AIO poll bit set
2002 */
2003 if (opcode & AIO_POLL_BIT)
2004 reqp->aio_req_flags |= AIO_POLL;
2005
2006 if (bufsize == 0) {
2007 clear_active_fd(fdes);
2008 aio_zerolen(reqp);
2009 return (0);
2010 }
2011 /*
2012 * send the request to driver.
2013 */
2014 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2015 /*
2016 * the fd is stored in the aio_req_t by aio_req_setup(), and
2017 * is released by the aio_cleanup_thread() when the IO has
2018 * completed.
2019 */
2020 if (error) {
2021 releasef(fdes);
2022 mutex_enter(&aiop->aio_mutex);
2023 aio_req_free(aiop, reqp);
2024 aiop->aio_pending--;
2025 if (aiop->aio_flags & AIO_REQ_BLOCK)
2026 cv_signal(&aiop->aio_cleanupcv);
2027 mutex_exit(&aiop->aio_mutex);
2028 return (error);
2029 }
2030 clear_active_fd(fdes);
2031 return (0);
2032 }
2033
2034 /*
2035 * posix version of asynchronous read and write
2036 */
2037 static int
2038 aiorw(
2039 int opcode,
2040 void *aiocb_arg,
2041 int mode,
2042 int run_mode)
2043 {
2044 #ifdef _SYSCALL32_IMPL
2045 aiocb32_t aiocb32;
2046 struct sigevent32 *sigev32;
2047 port_notify32_t pntfy32;
2048 #endif
2049 aiocb64_32_t aiocb64;
2050 aiocb_t aiocb;
2051 file_t *fp;
2052 int error, fd;
2053 size_t bufsize;
2054 struct vnode *vp;
2055 aio_req_t *reqp;
2056 aio_t *aiop;
2057 int (*aio_func)();
2058 aio_result_t *resultp;
2059 struct sigevent *sigev;
2060 model_t model;
2061 int aio_use_port = 0;
2062 port_notify_t pntfy;
2063
2064 model = get_udatamodel();
2065 aiop = curproc->p_aio;
2066 if (aiop == NULL)
2067 return (EINVAL);
2068
2069 if (model == DATAMODEL_NATIVE) {
2070 if (run_mode != AIO_LARGEFILE) {
2071 if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
2072 return (EFAULT);
2073 bufsize = aiocb.aio_nbytes;
2074 resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
2075 if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
2076 return (EBADF);
2077 }
2078 sigev = &aiocb.aio_sigevent;
2079 } else {
2080 /*
2081 			 * We come here only when a largefile call is
2082 			 * made on a 32 bit kernel using a 32 bit library.
2083 */
2084 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2085 return (EFAULT);
2086 bufsize = aiocb64.aio_nbytes;
2087 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2088 ->aio_resultp);
2089 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2090 return (EBADF);
2091 sigev = (struct sigevent *)&aiocb64.aio_sigevent;
2092 }
2093
2094 if (sigev->sigev_notify == SIGEV_PORT) {
2095 if (copyin((void *)sigev->sigev_value.sival_ptr,
2096 &pntfy, sizeof (port_notify_t))) {
2097 releasef(fd);
2098 return (EFAULT);
2099 }
2100 aio_use_port = 1;
2101 } else if (sigev->sigev_notify == SIGEV_THREAD) {
2102 pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo;
2103 pntfy.portnfy_user =
2104 aiocb.aio_sigevent.sigev_value.sival_ptr;
2105 aio_use_port = 1;
2106 }
2107 }
2108 #ifdef _SYSCALL32_IMPL
2109 else {
2110 if (run_mode == AIO_32) {
2111 /* 32 bit system call is being made on 64 bit kernel */
2112 if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
2113 return (EFAULT);
2114
2115 bufsize = aiocb32.aio_nbytes;
2116 aiocb_32ton(&aiocb32, &aiocb);
2117 resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
2118 aio_resultp);
2119 if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
2120 return (EBADF);
2121 }
2122 sigev32 = &aiocb32.aio_sigevent;
2123 } else if (run_mode == AIO_LARGEFILE) {
2124 /*
2125 			 * We come here only when a largefile call is
2126 			 * made on a 64 bit kernel using a 32 bit library.
2127 */
2128 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2129 return (EFAULT);
2130 bufsize = aiocb64.aio_nbytes;
2131 aiocb_LFton(&aiocb64, &aiocb);
2132 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2133 ->aio_resultp);
2134 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2135 return (EBADF);
2136 sigev32 = &aiocb64.aio_sigevent;
2137 }
2138
2139 if (sigev32->sigev_notify == SIGEV_PORT) {
2140 if (copyin(
2141 (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
2142 &pntfy32, sizeof (port_notify32_t))) {
2143 releasef(fd);
2144 return (EFAULT);
2145 }
2146 pntfy.portnfy_port = pntfy32.portnfy_port;
2147 pntfy.portnfy_user = (void *)(uintptr_t)
2148 pntfy32.portnfy_user;
2149 aio_use_port = 1;
2150 } else if (sigev32->sigev_notify == SIGEV_THREAD) {
2151 pntfy.portnfy_port = sigev32->sigev_signo;
2152 pntfy.portnfy_user = (void *)(uintptr_t)
2153 sigev32->sigev_value.sival_ptr;
2154 aio_use_port = 1;
2155 }
2156 }
2157 #endif /* _SYSCALL32_IMPL */
2158
2159 /*
2160 * check the permission of the partition
2161 */
2162
2163 if ((fp->f_flag & mode) == 0) {
2164 releasef(fd);
2165 return (EBADF);
2166 }
2167
2168 vp = fp->f_vnode;
2169 aio_func = check_vp(vp, mode);
2170 if (aio_func == NULL) {
2171 releasef(fd);
2172 return (EBADFD);
2173 }
2174 if (run_mode == AIO_LARGEFILE)
2175 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0);
2176 else
2177 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0);
2178
2179 if (error) {
2180 releasef(fd);
2181 return (error);
2182 }
2183 /*
2184 * enable polling on this request if the opcode has
2185 * the AIO poll bit set
2186 */
2187 if (opcode & AIO_POLL_BIT)
2188 reqp->aio_req_flags |= AIO_POLL;
2189
2190 if (model == DATAMODEL_NATIVE)
2191 reqp->aio_req_iocb.iocb = aiocb_arg;
2192 #ifdef _SYSCALL32_IMPL
2193 else
2194 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
2195 #endif
2196
2197 if (aio_use_port) {
2198 int event = (run_mode == AIO_LARGEFILE)?
2199 ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
2200 ((mode == FREAD)? AIOAREAD : AIOAWRITE);
2201 error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
2202 }
2203
2204 /*
2205 * send the request to driver.
2206 */
2207 if (error == 0) {
2208 if (bufsize == 0) {
2209 clear_active_fd(fd);
2210 aio_zerolen(reqp);
2211 return (0);
2212 }
2213 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2214 }
2215
2216 /*
2217 * the fd is stored in the aio_req_t by aio_req_setup(), and
2218 * is released by the aio_cleanup_thread() when the IO has
2219 * completed.
2220 */
2221 if (error) {
2222 releasef(fd);
2223 mutex_enter(&aiop->aio_mutex);
2224 if (aio_use_port)
2225 aio_deq(&aiop->aio_portpending, reqp);
2226 aio_req_free(aiop, reqp);
2227 aiop->aio_pending--;
2228 if (aiop->aio_flags & AIO_REQ_BLOCK)
2229 cv_signal(&aiop->aio_cleanupcv);
2230 mutex_exit(&aiop->aio_mutex);
2231 return (error);
2232 }
2233 clear_active_fd(fd);
2234 return (0);
2235 }
2236
2237
2238 /*
2239 * set error for a list IO entry that failed.
2240 */
2241 static void
2242 lio_set_error(aio_req_t *reqp, int portused)
2243 {
2244 aio_t *aiop = curproc->p_aio;
2245
2246 if (aiop == NULL)
2247 return;
2248
2249 mutex_enter(&aiop->aio_mutex);
2250 if (portused)
2251 aio_deq(&aiop->aio_portpending, reqp);
2252 aiop->aio_pending--;
2253 /* request failed, AIO_PHYSIODONE set to avoid physio cleanup. */
2254 reqp->aio_req_flags |= AIO_PHYSIODONE;
2255 /*
2256 * Need to free the request now as it's never
2257 * going to get on the done queue
2258 *
2259 * Note: aio_outstanding is decremented in
2260 * aio_req_free()
2261 */
2262 aio_req_free(aiop, reqp);
2263 if (aiop->aio_flags & AIO_REQ_BLOCK)
2264 cv_signal(&aiop->aio_cleanupcv);
2265 mutex_exit(&aiop->aio_mutex);
2266 }
2267
2268 /*
2269 * check if a specified request is done, and remove it from
2270 * the done queue. Otherwise, if NULL is specified, remove whichever
2271 * request is at the head of the done queue.
2272 */
2273 static aio_req_t *
2274 aio_req_done(void *resultp)
2275 {
2276 aio_req_t **bucket;
2277 aio_req_t *ent;
2278 aio_t *aiop = curproc->p_aio;
2279 long index;
2280
2281 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2282 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2283
2284 if (resultp) {
2285 index = AIO_HASH(resultp);
2286 bucket = &aiop->aio_hash[index];
2287 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2288 if (ent->aio_req_resultp == (aio_result_t *)resultp) {
2289 if (ent->aio_req_flags & AIO_DONEQ) {
2290 return (aio_req_remove(ent));
2291 }
2292 return (NULL);
2293 }
2294 }
2295 /* no match, resultp is invalid */
2296 return (NULL);
2297 }
2298 return (aio_req_remove(NULL));
2299 }
2300
2301 /*
2302 * determine if a user-level resultp pointer is associated with an
2303 * active IO request. Zero is returned when the request is done,
2304 * and the request is removed from the done queue. Only when the
2305 * return value is zero is the "reqp" pointer valid. One is returned
2306 * when the request is in progress. Two is returned when the request
2307 * is invalid.
2308 */
2309 static int
2310 aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
2311 {
2312 aio_req_t **bucket;
2313 aio_req_t *ent;
2314 aio_t *aiop = curproc->p_aio;
2315 long index;
2316
2317 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2318 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2319
2320 index = AIO_HASH(resultp);
2321 bucket = &aiop->aio_hash[index];
2322 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2323 if (ent->aio_req_resultp == resultp) {
2324 if (ent->aio_req_flags & AIO_DONEQ) {
2325 *reqp = aio_req_remove(ent);
2326 return (0);
2327 }
2328 return (1);
2329 }
2330 }
2331 /* no match, resultp is invalid */
2332 return (2);
2333 }
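
/*
 * Illustrative sketch only (not part of the original source): a caller
 * that already holds aio_cleanupq_mutex and aio_mutex can interpret the
 * return value of aio_req_find() roughly as follows, where only the
 * zero case leaves reqp pointing at a valid, dequeued request:
 *
 *	aio_req_t *reqp;
 *	int rv = aio_req_find(resultp, &reqp);
 *
 *	rv == 0	request done; reqp was removed from the done queue
 *	rv == 1	request still in progress; reqp is not set
 *	rv == 2	resultp matches no outstanding request
 */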
2334
2335 /*
2336 * remove a request from the done queue.
2337 */
2338 static aio_req_t *
2339 aio_req_remove(aio_req_t *reqp)
2340 {
2341 aio_t *aiop = curproc->p_aio;
2342
2343 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2344
2345 if (reqp != NULL) {
2346 ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2347 if (reqp->aio_req_next == reqp) {
2348 /* only one request on queue */
2349 if (reqp == aiop->aio_doneq) {
2350 aiop->aio_doneq = NULL;
2351 } else {
2352 ASSERT(reqp == aiop->aio_cleanupq);
2353 aiop->aio_cleanupq = NULL;
2354 }
2355 } else {
2356 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2357 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2358 /*
2359 * The request can be either on the aio_doneq or the
2360 * aio_cleanupq
2361 */
2362 if (reqp == aiop->aio_doneq)
2363 aiop->aio_doneq = reqp->aio_req_next;
2364
2365 if (reqp == aiop->aio_cleanupq)
2366 aiop->aio_cleanupq = reqp->aio_req_next;
2367 }
2368 reqp->aio_req_flags &= ~AIO_DONEQ;
2369 reqp->aio_req_next = NULL;
2370 reqp->aio_req_prev = NULL;
2371 } else if ((reqp = aiop->aio_doneq) != NULL) {
2372 ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2373 if (reqp == reqp->aio_req_next) {
2374 /* only one request on queue */
2375 aiop->aio_doneq = NULL;
2376 } else {
2377 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2378 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2379 aiop->aio_doneq = reqp->aio_req_next;
2380 }
2381 reqp->aio_req_flags &= ~AIO_DONEQ;
2382 reqp->aio_req_next = NULL;
2383 reqp->aio_req_prev = NULL;
2384 }
2385 if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN))
2386 cv_broadcast(&aiop->aio_waitcv);
2387 return (reqp);
2388 }
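
/*
 * Illustrative note (not in the original source): aio_req_done() above
 * calls aio_req_remove(NULL) to pull whichever request sits at the head
 * of aio_doneq, and aio_req_remove(ent) to unlink one specific entry
 * from either the done queue or the cleanup queue.
 */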
2389
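/*
 * Common setup for a single asynchronous request: allocate an aio_req_t
 * (and an optional sigqueue for SIGEV_SIGNAL notification), account for
 * it in aio_pending/aio_outstanding, queue it on aio_portpending when
 * SIGEV_THREAD/SIGEV_PORT notification is requested, and initialize the
 * embedded uio from the caller's aiocb.
 */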
2390 static int
2391 aio_req_setup(
2392 aio_req_t **reqpp,
2393 aio_t *aiop,
2394 aiocb_t *arg,
2395 aio_result_t *resultp,
2396 vnode_t *vp,
2397 int old_solaris_req)
2398 {
2399 sigqueue_t *sqp = NULL;
2400 aio_req_t *reqp;
2401 struct uio *uio;
2402 struct sigevent *sigev;
2403 int error;
2404
2405 sigev = &arg->aio_sigevent;
2406 if (sigev->sigev_notify == SIGEV_SIGNAL &&
2407 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
2408 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2409 if (sqp == NULL)
2410 return (EAGAIN);
2411 sqp->sq_func = NULL;
2412 sqp->sq_next = NULL;
2413 sqp->sq_info.si_code = SI_ASYNCIO;
2414 sqp->sq_info.si_pid = curproc->p_pid;
2415 sqp->sq_info.si_ctid = PRCTID(curproc);
2416 sqp->sq_info.si_zoneid = getzoneid();
2417 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2418 sqp->sq_info.si_signo = sigev->sigev_signo;
2419 sqp->sq_info.si_value = sigev->sigev_value;
2420 }
2421
2422 mutex_enter(&aiop->aio_mutex);
2423
2424 if (aiop->aio_flags & AIO_REQ_BLOCK) {
2425 mutex_exit(&aiop->aio_mutex);
2426 if (sqp)
2427 kmem_free(sqp, sizeof (sigqueue_t));
2428 return (EIO);
2429 }
2430 /*
2431 * get an aio_reqp from the free list or allocate one
2432 * from dynamic memory.
2433 */
2434 if (error = aio_req_alloc(&reqp, resultp)) {
2435 mutex_exit(&aiop->aio_mutex);
2436 if (sqp)
2437 kmem_free(sqp, sizeof (sigqueue_t));
2438 return (error);
2439 }
2440 aiop->aio_pending++;
2441 aiop->aio_outstanding++;
2442 reqp->aio_req_flags = AIO_PENDING;
2443 if (old_solaris_req) {
2444 /* this is an old solaris aio request */
2445 reqp->aio_req_flags |= AIO_SOLARIS;
2446 aiop->aio_flags |= AIO_SOLARIS_REQ;
2447 }
2448 if (sigev->sigev_notify == SIGEV_THREAD ||
2449 sigev->sigev_notify == SIGEV_PORT)
2450 aio_enq(&aiop->aio_portpending, reqp, 0);
2451 mutex_exit(&aiop->aio_mutex);
2452 /*
2453 * initialize aio request.
2454 */
2455 reqp->aio_req_fd = arg->aio_fildes;
2456 reqp->aio_req_sigqp = sqp;
2457 reqp->aio_req_iocb.iocb = NULL;
2458 reqp->aio_req_lio = NULL;
2459 reqp->aio_req_buf.b_file = vp;
2460 uio = reqp->aio_req.aio_uio;
2461 uio->uio_iovcnt = 1;
2462 uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
2463 uio->uio_iov->iov_len = arg->aio_nbytes;
2464 uio->uio_loffset = arg->aio_offset;
2465 *reqpp = reqp;
2466 return (0);
2467 }
2468
2469 /*
2470 * Allocate p_aio struct.
2471 */
2472 static aio_t *
2473 aio_aiop_alloc(void)
2474 {
2475 aio_t *aiop;
2476
2477 ASSERT(MUTEX_HELD(&curproc->p_lock));
2478
2479 aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
2480 if (aiop) {
2481 mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
2482 mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT,
2483 NULL);
2484 mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
2485 }
2486 return (aiop);
2487 }
2488
2489 /*
2490 * Allocate an aio_req struct.
2491 */
2492 static int
2493 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
2494 {
2495 aio_req_t *reqp;
2496 aio_t *aiop = curproc->p_aio;
2497
2498 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2499
2500 if ((reqp = aiop->aio_free) != NULL) {
2501 aiop->aio_free = reqp->aio_req_next;
2502 bzero(reqp, sizeof (*reqp));
2503 } else {
2504 /*
2505 * Check whether memory is getting tight.
2506 * This is a temporary mechanism to avoid memory
2507 * exhaustion by a single process until we come up
2508 * with a per process solution such as setrlimit().
2509 */
2510 if (freemem < desfree)
2511 return (EAGAIN);
2512 reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
2513 if (reqp == NULL)
2514 return (EAGAIN);
2515 }
2516 reqp->aio_req.aio_uio = &reqp->aio_req_uio;
2517 reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov;
2518 reqp->aio_req.aio_private = reqp;
2519 reqp->aio_req_buf.b_offset = -1;
2520 reqp->aio_req_resultp = resultp;
2521 if (aio_hash_insert(reqp, aiop)) {
2522 reqp->aio_req_next = aiop->aio_free;
2523 aiop->aio_free = reqp;
2524 return (EBUSY);
2525 }
2526 *nreqp = reqp;
2527 return (0);
2528 }
2529
2530 /*
2531 * Allocate an aio_lio_t struct.
2532 */
2533 static int
2534 aio_lio_alloc(aio_lio_t **head)
2535 {
2536 aio_lio_t *liop;
2537 aio_t *aiop = curproc->p_aio;
2538
2539 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2540
2541 if ((liop = aiop->aio_lio_free) != NULL) {
2542 aiop->aio_lio_free = liop->lio_next;
2543 } else {
2544 /*
2545 * Check whether memory is getting tight.
2546 * This is a temporary mechanism to avoid memory
2547 * exhaustion by a single process until we come up
2548 * with a per process solution such as setrlimit().
2549 */
2550 if (freemem < desfree)
2551 return (EAGAIN);
2552
2553 liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
2554 if (liop == NULL)
2555 return (EAGAIN);
2556 }
2557 *head = liop;
2558 return (0);
2559 }
2560
2561 /*
2562 * this is a special per-process thread that is only activated if
2563 * the process is unmapping a segment with outstanding aio. normally,
2564 * the process will have completed the aio before unmapping the
2565 * segment. If the process does unmap a segment with outstanding aio,
2566 * this special thread will guarantee that the locked pages due to
2567 * aphysio() are released, thereby permitting the segment to be
2568 * unmapped. In addition to this, the cleanup thread is woken up
2569 * during DR operations to release the locked pages.
2570 */
2571
2572 static int
2573 aio_cleanup_thread(aio_t *aiop)
2574 {
2575 proc_t *p = curproc;
2576 struct as *as = p->p_as;
2577 int poked = 0;
2578 kcondvar_t *cvp;
2579 int exit_flag = 0;
2580 int rqclnup = 0;
2581
2582 sigfillset(&curthread->t_hold);
2583 sigdiffset(&curthread->t_hold, &cantmask);
2584 for (;;) {
2585 /*
2586 * if a segment is being unmapped, and the current
2587 * process's done queue is not empty, then every request
2588 * on the doneq with locked resources should be forced
2589 * to release its locks. By moving the doneq request
2590 * to the cleanupq, aio_cleanup() will process the cleanupq,
2591 * and place requests back onto the doneq. All requests
2592 * processed by aio_cleanup() will have their physical
2593 * resources unlocked.
2594 */
2595 mutex_enter(&aiop->aio_mutex);
2596 if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
2597 aiop->aio_flags |= AIO_CLEANUP;
2598 mutex_enter(&as->a_contents);
2599 if (aiop->aio_rqclnup) {
2600 aiop->aio_rqclnup = 0;
2601 rqclnup = 1;
2602 }
2603 mutex_exit(&as->a_contents);
2604 if (aiop->aio_doneq) {
2605 aio_req_t *doneqhead = aiop->aio_doneq;
2606 aiop->aio_doneq = NULL;
2607 aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
2608 }
2609 }
2610 mutex_exit(&aiop->aio_mutex);
2611 aio_cleanup(AIO_CLEANUP_THREAD);
2612 /*
2613 * thread should block on the cleanupcv while
2614 * AIO_CLEANUP is set.
2615 */
2616 cvp = &aiop->aio_cleanupcv;
2617 mutex_enter(&aiop->aio_mutex);
2618
2619 if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
2620 aiop->aio_notifyq != NULL ||
2621 aiop->aio_portcleanupq != NULL) {
2622 mutex_exit(&aiop->aio_mutex);
2623 continue;
2624 }
2625 mutex_enter(&as->a_contents);
2626
2627 /*
2628 * AIO_CLEANUP determines when the cleanup thread
2629 * should be active. This flag is set when
2630 * the cleanup thread is awakened by as_unmap() or
2631 * due to DR operations.
2632 * The flag is cleared when the blocking as_unmap()
2633 * that originally awakened us is allowed to
2634 * complete. as_unmap() blocks when trying to
2635 * unmap a segment that has SOFTLOCKed pages. When
2636 * the segment's pages are all SOFTUNLOCKed,
2637 * as->a_flags & AS_UNMAPWAIT should be zero.
2638 *
2639 * In case of cleanup request by DR, the flag is cleared
2640 * once all the pending aio requests have been processed.
2641 *
2642 * The flag shouldn't be cleared right away if the
2643 * cleanup thread was interrupted because the process
2644 * is doing forkall(). This happens when cv_wait_sig()
2645 * returns zero, because it was awakened by a pokelwps().
2646 * If the process is not exiting, it must be doing forkall().
2647 */
2648 if ((poked == 0) &&
2649 ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
2650 (aiop->aio_pending == 0))) {
2651 aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
2652 cvp = &as->a_cv;
2653 rqclnup = 0;
2654 }
2655 mutex_exit(&aiop->aio_mutex);
2656 if (poked) {
2657 /*
2658 * If the process is exiting/killed, don't return
2659 * immediately without waiting for pending I/O's
2660 * and releasing the page locks.
2661 */
2662 if (p->p_flag & (SEXITLWPS|SKILLED)) {
2663 /*
2664 * If exit_flag is set, then it is
2665 * safe to exit because we have released
2666 * page locks of completed I/O's.
2667 */
2668 if (exit_flag)
2669 break;
2670
2671 mutex_exit(&as->a_contents);
2672
2673 /*
2674 * Wait for all the pending aio to complete.
2675 */
2676 mutex_enter(&aiop->aio_mutex);
2677 aiop->aio_flags |= AIO_REQ_BLOCK;
2678 while (aiop->aio_pending != 0)
2679 cv_wait(&aiop->aio_cleanupcv,
2680 &aiop->aio_mutex);
2681 mutex_exit(&aiop->aio_mutex);
2682 exit_flag = 1;
2683 continue;
2684 } else if (p->p_flag &
2685 (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
2686 /*
2687 * hold LWP until it
2688 * is continued.
2689 */
2690 mutex_exit(&as->a_contents);
2691 mutex_enter(&p->p_lock);
2692 stop(PR_SUSPENDED, SUSPEND_NORMAL);
2693 mutex_exit(&p->p_lock);
2694 poked = 0;
2695 continue;
2696 }
2697 } else {
2698 /*
2699 * When started, this thread sleeps on as->a_cv.
2700 * as_unmap() will wake this thread if the
2701 * segment has SOFTLOCKed pages (poked = 0).
2702 * 1. pokelwps() wakes this thread =>
2703 * break the loop to check SEXITLWPS, SHOLDFORK, etc.
2704 * 2. as_unmap() wakes this thread =>
2705 * to break the loop it is necessary that
2706 * - AS_UNMAPWAIT is set (as_unmap is waiting for
2707 * memory to be unlocked)
2708 * - AIO_CLEANUP is not set
2709 * (if AIO_CLEANUP is set we have to wait for
2710 * pending requests. aio_done will send a signal
2711 * for every request which completes to continue
2712 * unmapping the corresponding address range)
2713 * 3. A cleanup request will wake this thread up, ex.
2714 * by the DR operations. The aio_rqclnup flag will
2715 * be set.
2716 */
2717 while (poked == 0) {
2718 /*
2719 * Cleanup requests that came in after we
2720 * had just cleaned up cannot be what is
2721 * blocking the unmap thread, since the
2722 * unmap event happened first.
2723 * Let aio_done() wake us up if it sees a need.
2724 */
2725 if (aiop->aio_rqclnup &&
2726 (aiop->aio_flags & AIO_CLEANUP) == 0)
2727 break;
2728 poked = !cv_wait_sig(cvp, &as->a_contents);
2729 if (AS_ISUNMAPWAIT(as) == 0)
2730 cv_signal(cvp);
2731 if (aiop->aio_outstanding != 0)
2732 break;
2733 }
2734 }
2735 mutex_exit(&as->a_contents);
2736 }
2737 exit:
2738 mutex_exit(&as->a_contents);
2739 ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
2740 aston(curthread); /* make thread do post_syscall */
2741 return (0);
2742 }
2743
2744 /*
2745 * save a reference to a user's outstanding aio in a hash list.
2746 */
2747 static int
2748 aio_hash_insert(
2749 aio_req_t *aio_reqp,
2750 aio_t *aiop)
2751 {
2752 long index;
2753 aio_result_t *resultp = aio_reqp->aio_req_resultp;
2754 aio_req_t *current;
2755 aio_req_t **nextp;
2756
2757 index = AIO_HASH(resultp);
2758 nextp = &aiop->aio_hash[index];
2759 while ((current = *nextp) != NULL) {
2760 if (current->aio_req_resultp == resultp)
2761 return (DUPLICATE);
2762 nextp = &current->aio_hash_next;
2763 }
2764 *nextp = aio_reqp;
2765 aio_reqp->aio_hash_next = NULL;
2766 return (0);
2767 }
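
/*
 * Illustrative note (not in the original source): the DUPLICATE return
 * is how aio_req_alloc() detects that two outstanding requests name the
 * same user-level aio_result_t; it puts the new aio_req_t back on the
 * free list and fails the request with EBUSY.
 */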
2768
2769 static int
2770 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
2771 cred_t *)
2772 {
2773 struct snode *sp;
2774 dev_t dev;
2775 struct cb_ops *cb;
2776 major_t major;
2777 int (*aio_func)();
2778
2779 dev = vp->v_rdev;
2780 major = getmajor(dev);
2781
2782 /*
2783 * return NULL for requests to files and STREAMs so
2784 * that libaio takes care of them.
2785 */
2786 if (vp->v_type == VCHR) {
2787 /* no stream device for kaio */
2788 if (STREAMSTAB(major)) {
2789 return (NULL);
2790 }
2791 } else {
2792 return (NULL);
2793 }
2794
2795 /*
2796 * Check old drivers which do not have async I/O entry points.
2797 */
2798 if (devopsp[major]->devo_rev < 3)
2799 return (NULL);
2800
2801 cb = devopsp[major]->devo_cb_ops;
2802
2803 if (cb->cb_rev < 1)
2804 return (NULL);
2805
2806 /*
2807 * Check whether this device is a block device.
2808 * Kaio is not supported for devices like tty.
2809 */
2810 if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
2811 return (NULL);
2812
2813 /*
2814 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
2815 * We cannot call the driver directly. Instead return the
2816 * PXFS functions.
2817 */
2818
2819 if (IS_PXFSVP(vp)) {
2820 if (mode & FREAD)
2821 return (clpxfs_aio_read);
2822 else
2823 return (clpxfs_aio_write);
2824 }
2825 if (mode & FREAD)
2826 aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
2827 else
2828 aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;
2829
2830 /*
2831 * Do we need this ?
2832 * nodev returns ENXIO anyway.
2833 */
2834 if (aio_func == nodev)
2835 return (NULL);
2836
2837 sp = VTOS(vp);
2838 smark(sp, SACC);
2839 return (aio_func);
2840 }
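
/*
 * A minimal caller sketch (illustrative only; aiorw() above and the
 * list-I/O routines below follow this pattern):
 *
 *	int (*aio_func)() = check_vp(vp, mode);
 *
 *	if (aio_func == NULL)
 *		return (EBADFD);
 *	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
 */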
2841
2842 /*
2843 * Clustering: We want check_vp to return a function prototyped
2844 * correctly that will be common to both the PXFS and regular cases.
2845 * We define this intermediate function that will do the right
2846 * thing for driver cases.
2847 */
2848
2849 static int
2850 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2851 {
2852 dev_t dev;
2853 struct cb_ops *cb;
2854
2855 ASSERT(vp->v_type == VCHR);
2856 ASSERT(!IS_PXFSVP(vp));
2857 dev = VTOS(vp)->s_dev;
2858 ASSERT(STREAMSTAB(getmajor(dev)) == NULL);
2859
2860 cb = devopsp[getmajor(dev)]->devo_cb_ops;
2861
2862 ASSERT(cb->cb_awrite != nodev);
2863 return ((*cb->cb_awrite)(dev, aio, cred_p));
2864 }
2865
2866 /*
2867 * Clustering: We want check_vp to return a function prototyped
2868 * correctly that will be common to both the PXFS and regular cases.
2869 * We define this intermediate function that will do the right
2870 * thing for driver cases.
2871 */
2872
2873 static int
2874 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2875 {
2876 dev_t dev;
2877 struct cb_ops *cb;
2878
2879 ASSERT(vp->v_type == VCHR);
2880 ASSERT(!IS_PXFSVP(vp));
2881 dev = VTOS(vp)->s_dev;
2882 ASSERT(!STREAMSTAB(getmajor(dev)));
2883
2884 cb = devopsp[getmajor(dev)]->devo_cb_ops;
2885
2886 ASSERT(cb->cb_aread != nodev);
2887 return ((*cb->cb_aread)(dev, aio, cred_p));
2888 }
2889
2890 /*
2891 * This routine is called when a largefile call is made by a 32-bit
2892 * process on an ILP32 or LP64 kernel. All 64-bit processes are large
2893 * file by definition and will call alio() instead.
2894 */
2895 static int
2896 alioLF(
2897 int mode_arg,
2898 void *aiocb_arg,
2899 int nent,
2900 void *sigev)
2901 {
2902 file_t *fp;
2903 file_t *prev_fp = NULL;
2904 int prev_mode = -1;
2905 struct vnode *vp;
2906 aio_lio_t *head;
2907 aio_req_t *reqp;
2908 aio_t *aiop;
2909 caddr_t cbplist;
2910 aiocb64_32_t cb64;
2911 aiocb64_32_t *aiocb = &cb64;
2912 aiocb64_32_t *cbp;
2913 caddr32_t *ucbp;
2914 #ifdef _LP64
2915 aiocb_t aiocb_n;
2916 #endif
2917 struct sigevent32 sigevk;
2918 sigqueue_t *sqp;
2919 int (*aio_func)();
2920 int mode;
2921 int error = 0;
2922 int aio_errors = 0;
2923 int i;
2924 size_t ssize;
2925 int deadhead = 0;
2926 int aio_notsupported = 0;
2927 int lio_head_port;
2928 int aio_port;
2929 int aio_thread;
2930 port_kevent_t *pkevtp = NULL;
2931 int portused = 0;
2932 port_notify32_t pnotify;
2933 int event;
2934
2935 aiop = curproc->p_aio;
2936 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
2937 return (EINVAL);
2938
2939 ASSERT(get_udatamodel() == DATAMODEL_ILP32);
2940
2941 ssize = (sizeof (caddr32_t) * nent);
2942 cbplist = kmem_alloc(ssize, KM_SLEEP);
2943 ucbp = (caddr32_t *)cbplist;
2944
2945 if (copyin(aiocb_arg, cbplist, ssize) ||
2946 (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
2947 kmem_free(cbplist, ssize);
2948 return (EFAULT);
2949 }
2950
2951 /* Event Ports */
2952 if (sigev &&
2953 (sigevk.sigev_notify == SIGEV_THREAD ||
2954 sigevk.sigev_notify == SIGEV_PORT)) {
2955 if (sigevk.sigev_notify == SIGEV_THREAD) {
2956 pnotify.portnfy_port = sigevk.sigev_signo;
2957 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
2958 } else if (copyin(
2959 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
2960 &pnotify, sizeof (pnotify))) {
2961 kmem_free(cbplist, ssize);
2962 return (EFAULT);
2963 }
2964 error = port_alloc_event(pnotify.portnfy_port,
2965 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
2966 if (error) {
2967 if (error == ENOMEM || error == EAGAIN)
2968 error = EAGAIN;
2969 else
2970 error = EINVAL;
2971 kmem_free(cbplist, ssize);
2972 return (error);
2973 }
2974 lio_head_port = pnotify.portnfy_port;
2975 portused = 1;
2976 }
2977
2978 /*
2979 * a list head should be allocated if notification is
2980 * enabled for this list.
2981 */
2982 head = NULL;
2983
2984 if (mode_arg == LIO_WAIT || sigev) {
2985 mutex_enter(&aiop->aio_mutex);
2986 error = aio_lio_alloc(&head);
2987 mutex_exit(&aiop->aio_mutex);
2988 if (error)
2989 goto done;
2990 deadhead = 1;
2991 head->lio_nent = nent;
2992 head->lio_refcnt = nent;
2993 head->lio_port = -1;
2994 head->lio_portkev = NULL;
2995 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
2996 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
2997 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2998 if (sqp == NULL) {
2999 error = EAGAIN;
3000 goto done;
3001 }
3002 sqp->sq_func = NULL;
3003 sqp->sq_next = NULL;
3004 sqp->sq_info.si_code = SI_ASYNCIO;
3005 sqp->sq_info.si_pid = curproc->p_pid;
3006 sqp->sq_info.si_ctid = PRCTID(curproc);
3007 sqp->sq_info.si_zoneid = getzoneid();
3008 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3009 sqp->sq_info.si_signo = sigevk.sigev_signo;
3010 sqp->sq_info.si_value.sival_int =
3011 sigevk.sigev_value.sival_int;
3012 head->lio_sigqp = sqp;
3013 } else {
3014 head->lio_sigqp = NULL;
3015 }
3016 if (pkevtp) {
3017 /*
3018 * Prepare data to send when list of aiocb's
3019 * has completed.
3020 */
3021 port_init_event(pkevtp, (uintptr_t)sigev,
3022 (void *)(uintptr_t)pnotify.portnfy_user,
3023 NULL, head);
3024 pkevtp->portkev_events = AIOLIO64;
3025 head->lio_portkev = pkevtp;
3026 head->lio_port = pnotify.portnfy_port;
3027 }
3028 }
3029
3030 for (i = 0; i < nent; i++, ucbp++) {
3031
3032 cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
3033 /* skip entry if it can't be copied. */
3034 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
3035 if (head) {
3036 mutex_enter(&aiop->aio_mutex);
3037 head->lio_nent--;
3038 head->lio_refcnt--;
3039 mutex_exit(&aiop->aio_mutex);
3040 }
3041 continue;
3042 }
3043
3044 /* skip if opcode for aiocb is LIO_NOP */
3045 mode = aiocb->aio_lio_opcode;
3046 if (mode == LIO_NOP) {
3047 cbp = NULL;
3048 if (head) {
3049 mutex_enter(&aiop->aio_mutex);
3050 head->lio_nent--;
3051 head->lio_refcnt--;
3052 mutex_exit(&aiop->aio_mutex);
3053 }
3054 continue;
3055 }
3056
3057 /* increment file descriptor's ref count. */
3058 if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3059 lio_set_uerror(&cbp->aio_resultp, EBADF);
3060 if (head) {
3061 mutex_enter(&aiop->aio_mutex);
3062 head->lio_nent--;
3063 head->lio_refcnt--;
3064 mutex_exit(&aiop->aio_mutex);
3065 }
3066 aio_errors++;
3067 continue;
3068 }
3069
3070 /*
3071 * check the permission of the partition
3072 */
3073 if ((fp->f_flag & mode) == 0) {
3074 releasef(aiocb->aio_fildes);
3075 lio_set_uerror(&cbp->aio_resultp, EBADF);
3076 if (head) {
3077 mutex_enter(&aiop->aio_mutex);
3078 head->lio_nent--;
3079 head->lio_refcnt--;
3080 mutex_exit(&aiop->aio_mutex);
3081 }
3082 aio_errors++;
3083 continue;
3084 }
3085
3086 /*
3087 * common case where requests are to the same fd
3088 * for the same r/w operation
3089 * for UFS, need to set EBADFD
3090 */
3091 vp = fp->f_vnode;
3092 if (fp != prev_fp || mode != prev_mode) {
3093 aio_func = check_vp(vp, mode);
3094 if (aio_func == NULL) {
3095 prev_fp = NULL;
3096 releasef(aiocb->aio_fildes);
3097 lio_set_uerror(&cbp->aio_resultp, EBADFD);
3098 aio_notsupported++;
3099 if (head) {
3100 mutex_enter(&aiop->aio_mutex);
3101 head->lio_nent--;
3102 head->lio_refcnt--;
3103 mutex_exit(&aiop->aio_mutex);
3104 }
3105 continue;
3106 } else {
3107 prev_fp = fp;
3108 prev_mode = mode;
3109 }
3110 }
3111
3112 #ifdef _LP64
3113 aiocb_LFton(aiocb, &aiocb_n);
3114 error = aio_req_setup(&reqp, aiop, &aiocb_n,
3115 (aio_result_t *)&cbp->aio_resultp, vp, 0);
3116 #else
3117 error = aio_req_setupLF(&reqp, aiop, aiocb,
3118 (aio_result_t *)&cbp->aio_resultp, vp, 0);
3119 #endif /* _LP64 */
3120 if (error) {
3121 releasef(aiocb->aio_fildes);
3122 lio_set_uerror(&cbp->aio_resultp, error);
3123 if (head) {
3124 mutex_enter(&aiop->aio_mutex);
3125 head->lio_nent--;
3126 head->lio_refcnt--;
3127 mutex_exit(&aiop->aio_mutex);
3128 }
3129 aio_errors++;
3130 continue;
3131 }
3132
3133 reqp->aio_req_lio = head;
3134 deadhead = 0;
3135
3136 /*
3137 * Set the errno field now before sending the request to
3138 * the driver to avoid a race condition
3139 */
3140 (void) suword32(&cbp->aio_resultp.aio_errno,
3141 EINPROGRESS);
3142
3143 reqp->aio_req_iocb.iocb32 = *ucbp;
3144
3145 event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
3146 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3147 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3148 if (aio_port | aio_thread) {
3149 port_kevent_t *lpkevp;
3150 /*
3151 * Prepare data to send with each aiocb completed.
3152 */
3153 if (aio_port) {
3154 void *paddr = (void *)(uintptr_t)
3155 aiocb->aio_sigevent.sigev_value.sival_ptr;
3156 if (copyin(paddr, &pnotify, sizeof (pnotify)))
3157 error = EFAULT;
3158 } else { /* aio_thread */
3159 pnotify.portnfy_port =
3160 aiocb->aio_sigevent.sigev_signo;
3161 pnotify.portnfy_user =
3162 aiocb->aio_sigevent.sigev_value.sival_ptr;
3163 }
3164 if (error)
3165 /* EMPTY */;
3166 else if (pkevtp != NULL &&
3167 pnotify.portnfy_port == lio_head_port)
3168 error = port_dup_event(pkevtp, &lpkevp,
3169 PORT_ALLOC_DEFAULT);
3170 else
3171 error = port_alloc_event(pnotify.portnfy_port,
3172 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3173 &lpkevp);
3174 if (error == 0) {
3175 port_init_event(lpkevp, (uintptr_t)*ucbp,
3176 (void *)(uintptr_t)pnotify.portnfy_user,
3177 aio_port_callback, reqp);
3178 lpkevp->portkev_events = event;
3179 reqp->aio_req_portkev = lpkevp;
3180 reqp->aio_req_port = pnotify.portnfy_port;
3181 }
3182 }
3183
3184 /*
3185 * send the request to driver.
3186 */
3187 if (error == 0) {
3188 if (aiocb->aio_nbytes == 0) {
3189 clear_active_fd(aiocb->aio_fildes);
3190 aio_zerolen(reqp);
3191 continue;
3192 }
3193 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3194 CRED());
3195 }
3196
3197 /*
3198 * the fd's ref count is not decremented until the IO has
3199 * completed unless there was an error.
3200 */
3201 if (error) {
3202 releasef(aiocb->aio_fildes);
3203 lio_set_uerror(&cbp->aio_resultp, error);
3204 if (head) {
3205 mutex_enter(&aiop->aio_mutex);
3206 head->lio_nent--;
3207 head->lio_refcnt--;
3208 mutex_exit(&aiop->aio_mutex);
3209 }
3210 if (error == ENOTSUP)
3211 aio_notsupported++;
3212 else
3213 aio_errors++;
3214 lio_set_error(reqp, portused);
3215 } else {
3216 clear_active_fd(aiocb->aio_fildes);
3217 }
3218 }
3219
3220 if (aio_notsupported) {
3221 error = ENOTSUP;
3222 } else if (aio_errors) {
3223 /*
3224 * return EIO if any request failed
3225 */
3226 error = EIO;
3227 }
3228
3229 if (mode_arg == LIO_WAIT) {
3230 mutex_enter(&aiop->aio_mutex);
3231 while (head->lio_refcnt > 0) {
3232 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3233 mutex_exit(&aiop->aio_mutex);
3234 error = EINTR;
3235 goto done;
3236 }
3237 }
3238 mutex_exit(&aiop->aio_mutex);
3239 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
3240 }
3241
3242 done:
3243 kmem_free(cbplist, ssize);
3244 if (deadhead) {
3245 if (head->lio_sigqp)
3246 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3247 if (head->lio_portkev)
3248 port_free_event(head->lio_portkev);
3249 kmem_free(head, sizeof (aio_lio_t));
3250 }
3251 return (error);
3252 }
3253
3254 #ifdef _SYSCALL32_IMPL
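/*
 * Convert a largefile control block (aiocb64_32_t), as passed by a
 * 32-bit process, into the kernel's native aiocb_t.
 */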
3255 static void
3256 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
3257 {
3258 dest->aio_fildes = src->aio_fildes;
3259 dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
3260 dest->aio_nbytes = (size_t)src->aio_nbytes;
3261 dest->aio_offset = (off_t)src->aio_offset;
3262 dest->aio_reqprio = src->aio_reqprio;
3263 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3264 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3265
3266 /*
3267 * See comment in sigqueue32() on handling of 32-bit
3268 * sigvals in a 64-bit kernel.
3269 */
3270 dest->aio_sigevent.sigev_value.sival_int =
3271 (int)src->aio_sigevent.sigev_value.sival_int;
3272 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3273 (uintptr_t)src->aio_sigevent.sigev_notify_function;
3274 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3275 (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3276 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3277 dest->aio_lio_opcode = src->aio_lio_opcode;
3278 dest->aio_state = src->aio_state;
3279 dest->aio__pad[0] = src->aio__pad[0];
3280 }
3281 #endif
3282
3283 /*
3284 * This function is used only for largefile calls made by
3285 * 32-bit applications.
3286 */
3287 static int
3288 aio_req_setupLF(
3289 aio_req_t **reqpp,
3290 aio_t *aiop,
3291 aiocb64_32_t *arg,
3292 aio_result_t *resultp,
3293 vnode_t *vp,
3294 int old_solaris_req)
3295 {
3296 sigqueue_t *sqp = NULL;
3297 aio_req_t *reqp;
3298 struct uio *uio;
3299 struct sigevent32 *sigev;
3300 int error;
3301
3302 sigev = &arg->aio_sigevent;
3303 if (sigev->sigev_notify == SIGEV_SIGNAL &&
3304 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
3305 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3306 if (sqp == NULL)
3307 return (EAGAIN);
3308 sqp->sq_func = NULL;
3309 sqp->sq_next = NULL;
3310 sqp->sq_info.si_code = SI_ASYNCIO;
3311 sqp->sq_info.si_pid = curproc->p_pid;
3312 sqp->sq_info.si_ctid = PRCTID(curproc);
3313 sqp->sq_info.si_zoneid = getzoneid();
3314 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3315 sqp->sq_info.si_signo = sigev->sigev_signo;
3316 sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
3317 }
3318
3319 mutex_enter(&aiop->aio_mutex);
3320
3321 if (aiop->aio_flags & AIO_REQ_BLOCK) {
3322 mutex_exit(&aiop->aio_mutex);
3323 if (sqp)
3324 kmem_free(sqp, sizeof (sigqueue_t));
3325 return (EIO);
3326 }
3327 /*
3328 * get an aio_reqp from the free list or allocate one
3329 * from dynamic memory.
3330 */
3331 if (error = aio_req_alloc(&reqp, resultp)) {
3332 mutex_exit(&aiop->aio_mutex);
3333 if (sqp)
3334 kmem_free(sqp, sizeof (sigqueue_t));
3335 return (error);
3336 }
3337 aiop->aio_pending++;
3338 aiop->aio_outstanding++;
3339 reqp->aio_req_flags = AIO_PENDING;
3340 if (old_solaris_req) {
3341 /* this is an old solaris aio request */
3342 reqp->aio_req_flags |= AIO_SOLARIS;
3343 aiop->aio_flags |= AIO_SOLARIS_REQ;
3344 }
3345 if (sigev->sigev_notify == SIGEV_THREAD ||
3346 sigev->sigev_notify == SIGEV_PORT)
3347 aio_enq(&aiop->aio_portpending, reqp, 0);
3348 mutex_exit(&aiop->aio_mutex);
3349 /*
3350 * initialize aio request.
3351 */
3352 reqp->aio_req_fd = arg->aio_fildes;
3353 reqp->aio_req_sigqp = sqp;
3354 reqp->aio_req_iocb.iocb = NULL;
3355 reqp->aio_req_lio = NULL;
3356 reqp->aio_req_buf.b_file = vp;
3357 uio = reqp->aio_req.aio_uio;
3358 uio->uio_iovcnt = 1;
3359 uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
3360 uio->uio_iov->iov_len = arg->aio_nbytes;
3361 uio->uio_loffset = arg->aio_offset;
3362 *reqpp = reqp;
3363 return (0);
3364 }
3365
3366 /*
3367 * This routine is called when a non-largefile call is made by a 32-bit
3368 * process on an ILP32 or LP64 kernel.
3369 */
3370 static int
3371 alio32(
3372 int mode_arg,
3373 void *aiocb_arg,
3374 int nent,
3375 void *sigev)
3376 {
3377 file_t *fp;
3378 file_t *prev_fp = NULL;
3379 int prev_mode = -1;
3380 struct vnode *vp;
3381 aio_lio_t *head;
3382 aio_req_t *reqp;
3383 aio_t *aiop;
3384 caddr_t cbplist;
3385 aiocb_t cb;
3386 aiocb_t *aiocb = &cb;
3387 #ifdef _LP64
3388 aiocb32_t *cbp;
3389 caddr32_t *ucbp;
3390 aiocb32_t cb32;
3391 aiocb32_t *aiocb32 = &cb32;
3392 struct sigevent32 sigevk;
3393 #else
3394 aiocb_t *cbp, **ucbp;
3395 struct sigevent sigevk;
3396 #endif
3397 sigqueue_t *sqp;
3398 int (*aio_func)();
3399 int mode;
3400 int error = 0;
3401 int aio_errors = 0;
3402 int i;
3403 size_t ssize;
3404 int deadhead = 0;
3405 int aio_notsupported = 0;
3406 int lio_head_port;
3407 int aio_port;
3408 int aio_thread;
3409 port_kevent_t *pkevtp = NULL;
3410 int portused = 0;
3411 #ifdef _LP64
3412 port_notify32_t pnotify;
3413 #else
3414 port_notify_t pnotify;
3415 #endif
3416 int event;
3417
3418 aiop = curproc->p_aio;
3419 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
3420 return (EINVAL);
3421
3422 #ifdef _LP64
3423 ssize = (sizeof (caddr32_t) * nent);
3424 #else
3425 ssize = (sizeof (aiocb_t *) * nent);
3426 #endif
3427 cbplist = kmem_alloc(ssize, KM_SLEEP);
3428 ucbp = (void *)cbplist;
3429
3430 if (copyin(aiocb_arg, cbplist, ssize) ||
3431 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) {
3432 kmem_free(cbplist, ssize);
3433 return (EFAULT);
3434 }
3435
3436 /* Event Ports */
3437 if (sigev &&
3438 (sigevk.sigev_notify == SIGEV_THREAD ||
3439 sigevk.sigev_notify == SIGEV_PORT)) {
3440 if (sigevk.sigev_notify == SIGEV_THREAD) {
3441 pnotify.portnfy_port = sigevk.sigev_signo;
3442 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
3443 } else if (copyin(
3444 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
3445 &pnotify, sizeof (pnotify))) {
3446 kmem_free(cbplist, ssize);
3447 return (EFAULT);
3448 }
3449 error = port_alloc_event(pnotify.portnfy_port,
3450 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
3451 if (error) {
3452 if (error == ENOMEM || error == EAGAIN)
3453 error = EAGAIN;
3454 else
3455 error = EINVAL;
3456 kmem_free(cbplist, ssize);
3457 return (error);
3458 }
3459 lio_head_port = pnotify.portnfy_port;
3460 portused = 1;
3461 }
3462
3463 /*
3464 * a list head should be allocated if notification is
3465 * enabled for this list.
3466 */
3467 head = NULL;
3468
3469 if (mode_arg == LIO_WAIT || sigev) {
3470 mutex_enter(&aiop->aio_mutex);
3471 error = aio_lio_alloc(&head);
3472 mutex_exit(&aiop->aio_mutex);
3473 if (error)
3474 goto done;
3475 deadhead = 1;
3476 head->lio_nent = nent;
3477 head->lio_refcnt = nent;
3478 head->lio_port = -1;
3479 head->lio_portkev = NULL;
3480 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
3481 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
3482 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3483 if (sqp == NULL) {
3484 error = EAGAIN;
3485 goto done;
3486 }
3487 sqp->sq_func = NULL;
3488 sqp->sq_next = NULL;
3489 sqp->sq_info.si_code = SI_ASYNCIO;
3490 sqp->sq_info.si_pid = curproc->p_pid;
3491 sqp->sq_info.si_ctid = PRCTID(curproc);
3492 sqp->sq_info.si_zoneid = getzoneid();
3493 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3494 sqp->sq_info.si_signo = sigevk.sigev_signo;
3495 sqp->sq_info.si_value.sival_int =
3496 sigevk.sigev_value.sival_int;
3497 head->lio_sigqp = sqp;
3498 } else {
3499 head->lio_sigqp = NULL;
3500 }
3501 if (pkevtp) {
3502 /*
3503 * Prepare data to send when list of aiocb's has
3504 * completed.
3505 */
3506 port_init_event(pkevtp, (uintptr_t)sigev,
3507 (void *)(uintptr_t)pnotify.portnfy_user,
3508 NULL, head);
3509 pkevtp->portkev_events = AIOLIO;
3510 head->lio_portkev = pkevtp;
3511 head->lio_port = pnotify.portnfy_port;
3512 }
3513 }
3514
3515 for (i = 0; i < nent; i++, ucbp++) {
3516
3517 /* skip entry if it can't be copied. */
3518 #ifdef _LP64
3519 cbp = (aiocb32_t *)(uintptr_t)*ucbp;
3520 if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32)))
3521 #else
3522 cbp = (aiocb_t *)*ucbp;
3523 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb)))
3524 #endif
3525 {
3526 if (head) {
3527 mutex_enter(&aiop->aio_mutex);
3528 head->lio_nent--;
3529 head->lio_refcnt--;
3530 mutex_exit(&aiop->aio_mutex);
3531 }
3532 continue;
3533 }
3534 #ifdef _LP64
3535 /*
3536 * copy 32 bit structure into 64 bit structure
3537 */
3538 aiocb_32ton(aiocb32, aiocb);
3539 #endif /* _LP64 */
3540
3541 /* skip if opcode for aiocb is LIO_NOP */
3542 mode = aiocb->aio_lio_opcode;
3543 if (mode == LIO_NOP) {
3544 cbp = NULL;
3545 if (head) {
3546 mutex_enter(&aiop->aio_mutex);
3547 head->lio_nent--;
3548 head->lio_refcnt--;
3549 mutex_exit(&aiop->aio_mutex);
3550 }
3551 continue;
3552 }
3553
3554 /* increment file descriptor's ref count. */
3555 if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3556 lio_set_uerror(&cbp->aio_resultp, EBADF);
3557 if (head) {
3558 mutex_enter(&aiop->aio_mutex);
3559 head->lio_nent--;
3560 head->lio_refcnt--;
3561 mutex_exit(&aiop->aio_mutex);
3562 }
3563 aio_errors++;
3564 continue;
3565 }
3566
3567 /*
3568 * check the permission of the partition
3569 */
3570 if ((fp->f_flag & mode) == 0) {
3571 releasef(aiocb->aio_fildes);
3572 lio_set_uerror(&cbp->aio_resultp, EBADF);
3573 if (head) {
3574 mutex_enter(&aiop->aio_mutex);
3575 head->lio_nent--;
3576 head->lio_refcnt--;
3577 mutex_exit(&aiop->aio_mutex);
3578 }
3579 aio_errors++;
3580 continue;
3581 }
3582
3583 /*
3584 * common case where requests are to the same fd
3585 * for the same r/w operation
3586 * for UFS, need to set EBADFD
3587 */
3588 vp = fp->f_vnode;
3589 if (fp != prev_fp || mode != prev_mode) {
3590 aio_func = check_vp(vp, mode);
3591 if (aio_func == NULL) {
3592 prev_fp = NULL;
3593 releasef(aiocb->aio_fildes);
3594 lio_set_uerror(&cbp->aio_resultp, EBADFD);
3595 aio_notsupported++;
3596 if (head) {
3597 mutex_enter(&aiop->aio_mutex);
3598 head->lio_nent--;
3599 head->lio_refcnt--;
3600 mutex_exit(&aiop->aio_mutex);
3601 }
3602 continue;
3603 } else {
3604 prev_fp = fp;
3605 prev_mode = mode;
3606 }
3607 }
3608
3609 error = aio_req_setup(&reqp, aiop, aiocb,
3610 (aio_result_t *)&cbp->aio_resultp, vp, 0);
3611 if (error) {
3612 releasef(aiocb->aio_fildes);
3613 lio_set_uerror(&cbp->aio_resultp, error);
3614 if (head) {
3615 mutex_enter(&aiop->aio_mutex);
3616 head->lio_nent--;
3617 head->lio_refcnt--;
3618 mutex_exit(&aiop->aio_mutex);
3619 }
3620 aio_errors++;
3621 continue;
3622 }
3623
3624 reqp->aio_req_lio = head;
3625 deadhead = 0;
3626
3627 /*
3628 * Set the errno field now before sending the request to
3629 * the driver to avoid a race condition
3630 */
3631 (void) suword32(&cbp->aio_resultp.aio_errno,
3632 EINPROGRESS);
3633
3634 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp;
3635
3636 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
3637 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3638 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3639 if (aio_port | aio_thread) {
3640 port_kevent_t *lpkevp;
3641 /*
3642 * Prepare data to send with each aiocb completed.
3643 */
3644 #ifdef _LP64
3645 if (aio_port) {
3646 void *paddr = (void *)(uintptr_t)
3647 aiocb32->aio_sigevent.sigev_value.sival_ptr;
3648 if (copyin(paddr, &pnotify, sizeof (pnotify)))
3649 error = EFAULT;
3650 } else { /* aio_thread */
3651 pnotify.portnfy_port =
3652 aiocb32->aio_sigevent.sigev_signo;
3653 pnotify.portnfy_user =
3654 aiocb32->aio_sigevent.sigev_value.sival_ptr;
3655 }
3656 #else
3657 if (aio_port) {
3658 void *paddr =
3659 aiocb->aio_sigevent.sigev_value.sival_ptr;
3660 if (copyin(paddr, &pnotify, sizeof (pnotify)))
3661 error = EFAULT;
3662 } else { /* aio_thread */
3663 pnotify.portnfy_port =
3664 aiocb->aio_sigevent.sigev_signo;
3665 pnotify.portnfy_user =
3666 aiocb->aio_sigevent.sigev_value.sival_ptr;
3667 }
3668 #endif
3669 if (error)
3670 /* EMPTY */;
3671 else if (pkevtp != NULL &&
3672 pnotify.portnfy_port == lio_head_port)
3673 error = port_dup_event(pkevtp, &lpkevp,
3674 PORT_ALLOC_DEFAULT);
3675 else
3676 error = port_alloc_event(pnotify.portnfy_port,
3677 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3678 &lpkevp);
3679 if (error == 0) {
3680 port_init_event(lpkevp, (uintptr_t)cbp,
3681 (void *)(uintptr_t)pnotify.portnfy_user,
3682 aio_port_callback, reqp);
3683 lpkevp->portkev_events = event;
3684 reqp->aio_req_portkev = lpkevp;
3685 reqp->aio_req_port = pnotify.portnfy_port;
3686 }
3687 }
3688
3689 /*
3690 * send the request to driver.
3691 */
3692 if (error == 0) {
3693 if (aiocb->aio_nbytes == 0) {
3694 clear_active_fd(aiocb->aio_fildes);
3695 aio_zerolen(reqp);
3696 continue;
3697 }
3698 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3699 CRED());
3700 }
3701
3702 /*
3703 * the fd's ref count is not decremented until the IO has
3704 * completed unless there was an error.
3705 */
3706 if (error) {
3707 releasef(aiocb->aio_fildes);
3708 lio_set_uerror(&cbp->aio_resultp, error);
3709 if (head) {
3710 mutex_enter(&aiop->aio_mutex);
3711 head->lio_nent--;
3712 head->lio_refcnt--;
3713 mutex_exit(&aiop->aio_mutex);
3714 }
3715 if (error == ENOTSUP)
3716 aio_notsupported++;
3717 else
3718 aio_errors++;
3719 lio_set_error(reqp, portused);
3720 } else {
3721 clear_active_fd(aiocb->aio_fildes);
3722 }
3723 }
3724
3725 if (aio_notsupported) {
3726 error = ENOTSUP;
3727 } else if (aio_errors) {
3728 /*
3729 * return EIO if any request failed
3730 */
3731 error = EIO;
3732 }
3733
3734 if (mode_arg == LIO_WAIT) {
3735 mutex_enter(&aiop->aio_mutex);
3736 while (head->lio_refcnt > 0) {
3737 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3738 mutex_exit(&aiop->aio_mutex);
3739 error = EINTR;
3740 goto done;
3741 }
3742 }
3743 mutex_exit(&aiop->aio_mutex);
3744 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
3745 }
3746
3747 done:
3748 kmem_free(cbplist, ssize);
3749 if (deadhead) {
3750 if (head->lio_sigqp)
3751 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3752 if (head->lio_portkev)
3753 port_free_event(head->lio_portkev);
3754 kmem_free(head, sizeof (aio_lio_t));
3755 }
3756 return (error);
3757 }
3758
3759
3760 #ifdef _SYSCALL32_IMPL
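/*
 * Convert a 32-bit process's aiocb32_t into the kernel's native aiocb_t.
 */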
3761 void
3762 aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
3763 {
3764 dest->aio_fildes = src->aio_fildes;
3765 dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
3766 dest->aio_nbytes = (size_t)src->aio_nbytes;
3767 dest->aio_offset = (off_t)src->aio_offset;
3768 dest->aio_reqprio = src->aio_reqprio;
3769 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3770 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3771
3772 /*
3773 * See comment in sigqueue32() on handling of 32-bit
3774 * sigvals in a 64-bit kernel.
3775 */
3776 dest->aio_sigevent.sigev_value.sival_int =
3777 (int)src->aio_sigevent.sigev_value.sival_int;
3778 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3779 (uintptr_t)src->aio_sigevent.sigev_notify_function;
3780 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3781 (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3782 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3783 dest->aio_lio_opcode = src->aio_lio_opcode;
3784 dest->aio_state = src->aio_state;
3785 dest->aio__pad[0] = src->aio__pad[0];
3786 }
3787 #endif /* _SYSCALL32_IMPL */
3788
3789 /*
3790 * aio_port_callback() is called just before the event is retrieved from the
3791 * port. The task of this callback function is to finish the work of the
3792 * transaction for the application, which means:
3793 * - copy transaction data out to the application
3794 * (this thread is running in the right process context)
3795 * - keep track of the transaction (update counters).
3796 * - free allocated buffers
3797 * The aiocb pointer is the object element of the port_kevent_t structure.
3798 *
3799 * flag :
3800 * PORT_CALLBACK_DEFAULT : do copyout and free resources
3801 * PORT_CALLBACK_CLOSE : don't do copyout, free resources
3802 */
3803
3804 /*ARGSUSED*/
3805 int
3806 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
3807 {
3808 aio_t *aiop = curproc->p_aio;
3809 aio_req_t *reqp = arg;
3810 struct iovec *iov;
3811 struct buf *bp;
3812 void *resultp;
3813
3814 if (pid != curproc->p_pid) {
3815 /* wrong proc, cannot deliver data here ... */
3816 return (EACCES);
3817 }
3818
3819 mutex_enter(&aiop->aio_portq_mutex);
3820 reqp->aio_req_portkev = NULL;
3821 aio_req_remove_portq(aiop, reqp); /* remove request from portq */
3822 mutex_exit(&aiop->aio_portq_mutex);
3823 aphysio_unlock(reqp); /* unlock used pages */
3824 mutex_enter(&aiop->aio_mutex);
3825 if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
3826 aio_req_free_port(aiop, reqp); /* back to free list */
3827 mutex_exit(&aiop->aio_mutex);
3828 return (0);
3829 }
3830
3831 iov = reqp->aio_req_uio.uio_iov;
3832 bp = &reqp->aio_req_buf;
3833 resultp = (void *)reqp->aio_req_resultp;
3834 aio_req_free_port(aiop, reqp); /* request struct back to free list */
3835 mutex_exit(&aiop->aio_mutex);
3836 if (flag == PORT_CALLBACK_DEFAULT)
3837 aio_copyout_result_port(iov, bp, resultp);
3838 return (0);
3839 }
3840