1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright (c) 2018, Joyent, Inc.
29 */
30
31 /*
32 * Kernel asynchronous I/O.
33 * This is only for raw devices now (as of Nov. 1993).
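 *
 * All of the commands (AIOREAD, AIOWRITE, AIOLIO, AIOWAIT, AIOSUSPEND, ...)
 * are multiplexed through a single system call: kaioc() on 64-bit kernels
 * and kaio() on 32-bit kernels (and for 32-bit applications on 64-bit
 * kernels).
 *
 * A rough sketch of typical use, assuming the usual libaio/libc
 * aioread(3)/aiowait(3) wrappers (which are not part of this file) and an
 * fd that refers to a raw device:
 *
 *	aio_result_t res;
 *	char buf[512];
 *
 *	if (aioread(fd, buf, sizeof (buf), 0, SEEK_SET, &res) == 0)
 *		(void) aiowait(NULL);	... blocks until a request completes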
34 */
35
36 #include <sys/types.h>
37 #include <sys/errno.h>
38 #include <sys/conf.h>
39 #include <sys/file.h>
40 #include <sys/fs/snode.h>
41 #include <sys/unistd.h>
42 #include <sys/cmn_err.h>
43 #include <vm/as.h>
44 #include <vm/faultcode.h>
45 #include <sys/sysmacros.h>
46 #include <sys/procfs.h>
47 #include <sys/kmem.h>
48 #include <sys/autoconf.h>
49 #include <sys/ddi_impldefs.h>
50 #include <sys/sunddi.h>
51 #include <sys/aio_impl.h>
52 #include <sys/debug.h>
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/vmsystm.h>
56 #include <sys/fs/pxfs_ki.h>
57 #include <sys/contract/process_impl.h>
58
59 /*
60 * external entry points.
61 */
62 #ifdef _LP64
63 static int64_t kaioc(long, long, long, long, long, long);
64 #endif
65 static int kaio(ulong_t *, rval_t *);
66
67
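/*
 * run_mode values passed to the implementation routines below; they select
 * the layout of the user-supplied control blocks: native 64-bit aiocb_t's,
 * 32-bit aiocb's, or largefile (aiocb64_32_t) control blocks.
 */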
68 #define AIO_64 0
69 #define AIO_32 1
70 #define AIO_LARGEFILE 2
71
72 /*
73 * implementation specific functions (private)
74 */
75 #ifdef _LP64
76 static int alio(int, aiocb_t **, int, struct sigevent *);
77 #endif
78 static int aionotify(void);
79 static int aioinit(void);
80 static int aiostart(void);
81 static void alio_cleanup(aio_t *, aiocb_t **, int, int);
82 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
83 cred_t *);
84 static void lio_set_error(aio_req_t *, int portused);
85 static aio_t *aio_aiop_alloc();
86 static int aio_req_alloc(aio_req_t **, aio_result_t *);
87 static int aio_lio_alloc(aio_lio_t **);
88 static aio_req_t *aio_req_done(void *);
89 static aio_req_t *aio_req_remove(aio_req_t *);
90 static int aio_req_find(aio_result_t *, aio_req_t **);
91 static int aio_hash_insert(struct aio_req_t *, aio_t *);
92 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
93 aio_result_t *, vnode_t *, int);
94 static int aio_cleanup_thread(aio_t *);
95 static aio_lio_t *aio_list_get(aio_result_t *);
96 static void lio_set_uerror(void *, int);
97 extern void aio_zerolen(aio_req_t *);
98 static int aiowait(struct timeval *, int, long *);
99 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
100 static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
101 aio_req_t *reqlist, aio_t *aiop, model_t model);
102 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
103 static int aiosuspend(void *, int, struct timespec *, int,
104 long *, int);
105 static int aliowait(int, void *, int, void *, int);
106 static int aioerror(void *, int);
107 static int aio_cancel(int, void *, long *, int);
108 static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
109 static int aiorw(int, void *, int, int);
110
111 static int alioLF(int, void *, int, void *);
112 static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
113 aio_result_t *, vnode_t *, int);
114 static int alio32(int, void *, int, void *);
115 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
116 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
117
118 #ifdef _SYSCALL32_IMPL
119 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
120 void aiocb_32ton(aiocb32_t *, aiocb_t *);
121 #endif /* _SYSCALL32_IMPL */
122
123 /*
124 * implementation specific functions (external)
125 */
126 void aio_req_free(aio_t *, aio_req_t *);
127
128 /*
129 * Event Port framework
130 */
131
132 void aio_req_free_port(aio_t *, aio_req_t *);
133 static int aio_port_callback(void *, int *, pid_t, int, void *);
134
135 /*
136 * This is the loadable module wrapper.
137 */
138 #include <sys/modctl.h>
139 #include <sys/syscall.h>
140
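/*
 * System call entry tables.  On a 64-bit kernel the native entry point is
 * kaioc(), which takes six long arguments (the file offset fits in a single
 * argument); 32-bit applications (and 32-bit kernels) go through kaio(),
 * which takes seven arguments because the 64-bit offset is passed as two
 * 32-bit words and reassembled in kaio() itself.
 */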
141 #ifdef _LP64
142
143 static struct sysent kaio_sysent = {
144 6,
145 SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
146 (int (*)())(uintptr_t)kaioc
147 };
148
149 #ifdef _SYSCALL32_IMPL
150 static struct sysent kaio_sysent32 = {
151 7,
152 SE_NOUNLOAD | SE_64RVAL,
153 kaio
154 };
155 #endif /* _SYSCALL32_IMPL */
156
157 #else /* _LP64 */
158
159 static struct sysent kaio_sysent = {
160 7,
161 SE_NOUNLOAD | SE_32RVAL1,
162 kaio
163 };
164
165 #endif /* _LP64 */
166
167 /*
168 * Module linkage information for the kernel.
169 */
170
171 static struct modlsys modlsys = {
172 &mod_syscallops,
173 "kernel Async I/O",
174 &kaio_sysent
175 };
176
177 #ifdef _SYSCALL32_IMPL
178 static struct modlsys modlsys32 = {
179 &mod_syscallops32,
180 "kernel Async I/O for 32 bit compatibility",
181 &kaio_sysent32
182 };
183 #endif /* _SYSCALL32_IMPL */
184
185
186 static struct modlinkage modlinkage = {
187 MODREV_1,
188 &modlsys,
189 #ifdef _SYSCALL32_IMPL
190 &modlsys32,
191 #endif
192 NULL
193 };
194
195 int
196 _init(void)
197 {
198 int retval;
199
200 if ((retval = mod_install(&modlinkage)) != 0)
201 return (retval);
202
203 return (0);
204 }
205
206 int
207 _fini(void)
208 {
209 int retval;
210
211 retval = mod_remove(&modlinkage);
212
213 return (retval);
214 }
215
216 int
217 _info(struct modinfo *modinfop)
218 {
219 return (mod_info(&modlinkage, modinfop));
220 }
221
222 #ifdef _LP64
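/*
 * Native (LP64) entry point.  a0 carries the command (possibly with
 * AIO_POLL_BIT or'ed in); the meaning of a1..a5 depends on the command.
 */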
223 static int64_t
224 kaioc(
225 long a0,
226 long a1,
227 long a2,
228 long a3,
229 long a4,
230 long a5)
231 {
232 int error;
233 long rval = 0;
234
235 switch ((int)a0 & ~AIO_POLL_BIT) {
236 case AIOREAD:
237 error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
238 (offset_t)a4, (aio_result_t *)a5, FREAD);
239 break;
240 case AIOWRITE:
241 error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
242 (offset_t)a4, (aio_result_t *)a5, FWRITE);
243 break;
244 case AIOWAIT:
245 error = aiowait((struct timeval *)a1, (int)a2, &rval);
246 break;
247 case AIOWAITN:
248 error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
249 (timespec_t *)a4);
250 break;
251 case AIONOTIFY:
252 error = aionotify();
253 break;
254 case AIOINIT:
255 error = aioinit();
256 break;
257 case AIOSTART:
258 error = aiostart();
259 break;
260 case AIOLIO:
261 error = alio((int)a1, (aiocb_t **)a2, (int)a3,
262 (struct sigevent *)a4);
263 break;
264 case AIOLIOWAIT:
265 error = aliowait((int)a1, (void *)a2, (int)a3,
266 (struct sigevent *)a4, AIO_64);
267 break;
268 case AIOSUSPEND:
269 error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
270 (int)a4, &rval, AIO_64);
271 break;
272 case AIOERROR:
273 error = aioerror((void *)a1, AIO_64);
274 break;
275 case AIOAREAD:
276 error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
277 break;
278 case AIOAWRITE:
279 error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
280 break;
281 case AIOCANCEL:
282 error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
283 break;
284
285 /*
286 * The large file related calls are valid only for the
287 * 32-bit kernel, not for the 64-bit kernel.
288 * On the 64-bit kernel, large file calls are converted
289 * to regular 64-bit calls.
290 */
291
292 default:
293 error = EINVAL;
294 }
295 if (error)
296 return ((int64_t)set_errno(error));
297 return (rval);
298 }
299 #endif
300
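/*
 * kaio() is the entry point for 32-bit callers: it is the native system
 * call on a 32-bit kernel and the compatibility entry for 32-bit
 * applications on a 64-bit kernel.  The arguments arrive as an array of
 * user longs (uap), with the 64-bit file offset split across uap[4] and
 * uap[5].
 */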
301 static int
302 kaio(
303 ulong_t *uap,
304 rval_t *rvp)
305 {
306 long rval = 0;
307 int error = 0;
308 offset_t off;
309
310
311 rvp->r_vals = 0;
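	/*
	 * Reassemble the 64-bit file offset from the two 32-bit words in
	 * which a 32-bit caller passes it; only AIOREAD/AIOWRITE use it.
	 */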
312 #if defined(_LITTLE_ENDIAN)
313 off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
314 #else
315 off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
316 #endif
317
318 switch (uap[0] & ~AIO_POLL_BIT) {
319 /*
320 * It must be the 32 bit system call on 64 bit kernel
321 */
322 case AIOREAD:
323 return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
324 (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
325 case AIOWRITE:
326 return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
327 (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
328 case AIOWAIT:
329 error = aiowait((struct timeval *)uap[1], (int)uap[2],
330 &rval);
331 break;
332 case AIOWAITN:
333 error = aiowaitn((void *)uap[1], (uint_t)uap[2],
334 (uint_t *)uap[3], (timespec_t *)uap[4]);
335 break;
336 case AIONOTIFY:
337 return (aionotify());
338 case AIOINIT:
339 return (aioinit());
340 case AIOSTART:
341 return (aiostart());
342 case AIOLIO:
343 return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
344 (void *)uap[4]));
345 case AIOLIOWAIT:
346 return (aliowait((int)uap[1], (void *)uap[2],
347 (int)uap[3], (struct sigevent *)uap[4], AIO_32));
348 case AIOSUSPEND:
349 error = aiosuspend((void *)uap[1], (int)uap[2],
350 (timespec_t *)uap[3], (int)uap[4],
351 &rval, AIO_32);
352 break;
353 case AIOERROR:
354 return (aioerror((void *)uap[1], AIO_32));
355 case AIOAREAD:
356 return (aiorw((int)uap[0], (void *)uap[1],
357 FREAD, AIO_32));
358 case AIOAWRITE:
359 return (aiorw((int)uap[0], (void *)uap[1],
360 FWRITE, AIO_32));
361 case AIOCANCEL:
362 error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
363 AIO_32));
364 break;
365 case AIOLIO64:
366 return (alioLF((int)uap[1], (void *)uap[2],
367 (int)uap[3], (void *)uap[4]));
368 case AIOLIOWAIT64:
369 return (aliowait(uap[1], (void *)uap[2],
370 (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
371 case AIOSUSPEND64:
372 error = aiosuspend((void *)uap[1], (int)uap[2],
373 (timespec_t *)uap[3], (int)uap[4], &rval,
374 AIO_LARGEFILE);
375 break;
376 case AIOERROR64:
377 return (aioerror((void *)uap[1], AIO_LARGEFILE));
378 case AIOAREAD64:
379 return (aiorw((int)uap[0], (void *)uap[1], FREAD,
380 AIO_LARGEFILE));
381 case AIOAWRITE64:
382 return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
383 AIO_LARGEFILE));
384 case AIOCANCEL64:
385 error = (aio_cancel((int)uap[1], (void *)uap[2],
386 &rval, AIO_LARGEFILE));
387 break;
388 default:
389 return (EINVAL);
390 }
391
392 rvp->r_val1 = rval;
393 return (error);
394 }
395
396 /*
397 * wake up LWPs in this process that are sleeping in
398 * aiowait().
399 */
400 static int
401 aionotify(void)
402 {
403 aio_t *aiop;
404
405 aiop = curproc->p_aio;
406 if (aiop == NULL)
407 return (0);
408
409 mutex_enter(&aiop->aio_mutex);
410 aiop->aio_notifycnt++;
411 cv_broadcast(&aiop->aio_waitcv);
412 mutex_exit(&aiop->aio_mutex);
413
414 return (0);
415 }
416
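/*
 * Convert a user-supplied struct timeval (aiowait-style timeout) into a
 * relative timestruc_t.  *rqtp is left NULL and *blocking is set when the
 * caller should wait indefinitely (timout == NULL); a timeout of (void *)-1
 * or of zero means "don't block".
 */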
417 static int
418 timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
419 timestruc_t **rqtp, int *blocking)
420 {
421 #ifdef _SYSCALL32_IMPL
422 struct timeval32 wait_time_32;
423 #endif
424 struct timeval wait_time;
425 model_t model = get_udatamodel();
426
427 *rqtp = NULL;
428 if (timout == NULL) { /* wait indefinitely */
429 *blocking = 1;
430 return (0);
431 }
432
433 /*
434 * Need to correctly compare with the -1 passed in for a user
435 * address pointer, with both 32 bit and 64 bit apps.
436 */
437 if (model == DATAMODEL_NATIVE) {
438 if ((intptr_t)timout == (intptr_t)-1) { /* don't wait */
439 *blocking = 0;
440 return (0);
441 }
442
443 if (copyin(timout, &wait_time, sizeof (wait_time)))
444 return (EFAULT);
445 }
446 #ifdef _SYSCALL32_IMPL
447 else {
448 /*
449 * A -1 from a 32-bit app will not get sign extended;
450 * don't wait if it is -1.
451 */
452 if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
453 *blocking = 0;
454 return (0);
455 }
456
457 if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
458 return (EFAULT);
459 TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
460 }
461 #endif /* _SYSCALL32_IMPL */
462
463 if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) { /* don't wait */
464 *blocking = 0;
465 return (0);
466 }
467
468 if (wait_time.tv_sec < 0 ||
469 wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
470 return (EINVAL);
471
472 rqtime->tv_sec = wait_time.tv_sec;
473 rqtime->tv_nsec = wait_time.tv_usec * 1000;
474 *rqtp = rqtime;
475 *blocking = 1;
476
477 return (0);
478 }
479
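/*
 * Same conversion as above, but for a struct timespec (the aio_waitn and
 * aio_suspend style timeout); note that only NULL and a zero timespec get
 * special treatment here, there is no -1 case.
 */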
480 static int
481 timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
482 timestruc_t **rqtp, int *blocking)
483 {
484 #ifdef _SYSCALL32_IMPL
485 timespec32_t wait_time_32;
486 #endif
487 model_t model = get_udatamodel();
488
489 *rqtp = NULL;
490 if (timout == NULL) {
491 *blocking = 1;
492 return (0);
493 }
494
495 if (model == DATAMODEL_NATIVE) {
496 if (copyin(timout, rqtime, sizeof (*rqtime)))
497 return (EFAULT);
498 }
499 #ifdef _SYSCALL32_IMPL
500 else {
501 if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
502 return (EFAULT);
503 TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
504 }
505 #endif /* _SYSCALL32_IMPL */
506
507 if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
508 *blocking = 0;
509 return (0);
510 }
511
512 if (rqtime->tv_sec < 0 ||
513 rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
514 return (EINVAL);
515
516 *rqtp = rqtime;
517 *blocking = 1;
518
519 return (0);
520 }
521
522 /*ARGSUSED*/
523 static int
524 aiowait(struct timeval *timout, int dontblockflg, long *rval)
525 {
526 int error;
527 aio_t *aiop;
528 aio_req_t *reqp;
529 clock_t status;
530 int blocking;
531 int timecheck;
532 timestruc_t rqtime;
533 timestruc_t *rqtp;
534
535 aiop = curproc->p_aio;
536 if (aiop == NULL)
537 return (EINVAL);
538
539 /*
540 * Establish the absolute future time for the timeout.
541 */
542 error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
543 if (error)
544 return (error);
545 if (rqtp) {
546 timestruc_t now;
547 timecheck = timechanged;
548 gethrestime(&now);
549 timespecadd(rqtp, &now);
550 }
551
552 mutex_enter(&aiop->aio_mutex);
553 for (;;) {
554 /* process requests on poll queue */
555 if (aiop->aio_pollq) {
556 mutex_exit(&aiop->aio_mutex);
557 aio_cleanup(0);
558 mutex_enter(&aiop->aio_mutex);
559 }
560 if ((reqp = aio_req_remove(NULL)) != NULL) {
561 *rval = (long)reqp->aio_req_resultp;
562 break;
563 }
564 /* user-level done queue might not be empty */
565 if (aiop->aio_notifycnt > 0) {
566 aiop->aio_notifycnt--;
567 *rval = 1;
568 break;
569 }
570 /* don't block if no outstanding aio */
571 if (aiop->aio_outstanding == 0 && dontblockflg) {
572 error = EINVAL;
573 break;
574 }
575 if (blocking) {
576 status = cv_waituntil_sig(&aiop->aio_waitcv,
577 &aiop->aio_mutex, rqtp, timecheck);
578
579 if (status > 0) /* check done queue again */
580 continue;
581 if (status == 0) { /* interrupted by a signal */
582 error = EINTR;
583 *rval = -1;
584 } else { /* timer expired */
585 error = ETIME;
586 }
587 }
588 break;
589 }
590 mutex_exit(&aiop->aio_mutex);
591 if (reqp) {
592 aphysio_unlock(reqp);
593 aio_copyout_result(reqp);
594 mutex_enter(&aiop->aio_mutex);
595 aio_req_free(aiop, reqp);
596 mutex_exit(&aiop->aio_mutex);
597 }
598 return (error);
599 }
600
601 /*
602 * aiowaitn can be used to reap completed asynchronous requests submitted with
603 * lio_listio, aio_read or aio_write.
604 * This function only reaps asynchronous raw I/Os.
605 */
606
607 /*ARGSUSED*/
608 static int
609 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
610 {
611 int error = 0;
612 aio_t *aiop;
613 aio_req_t *reqlist = NULL;
614 caddr_t iocblist = NULL; /* array of iocb ptr's */
615 uint_t waitcnt, cnt = 0; /* iocb cnt */
616 size_t iocbsz; /* users iocb size */
617 size_t riocbsz; /* returned iocb size */
618 int iocb_index = 0;
619 model_t model = get_udatamodel();
620 int blocking = 1;
621 int timecheck;
622 timestruc_t rqtime;
623 timestruc_t *rqtp;
624
625 aiop = curproc->p_aio;
626 if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX)
627 return (EINVAL);
628
629 if (aiop->aio_outstanding == 0)
630 return (EAGAIN);
631
632 if (copyin(nwait, &waitcnt, sizeof (uint_t)))
633 return (EFAULT);
634
635 /* set *nwait to zero, if we must return prematurely */
636 if (copyout(&cnt, nwait, sizeof (uint_t)))
637 return (EFAULT);
638
639 if (waitcnt == 0) {
640 blocking = 0;
641 rqtp = NULL;
642 waitcnt = nent;
643 } else {
644 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
645 if (error)
646 return (error);
647 }
648
649 if (model == DATAMODEL_NATIVE)
650 iocbsz = (sizeof (aiocb_t *) * nent);
651 #ifdef _SYSCALL32_IMPL
652 else
653 iocbsz = (sizeof (caddr32_t) * nent);
654 #endif /* _SYSCALL32_IMPL */
655
656 /*
657 * Only one aio_waitn call is allowed at a time.
658 * The active aio_waitn will collect all requests
659 * out of the "done" list and if necessary it will wait
660 * for some/all pending requests to fulfill the nwait
661 * parameter.
662 * Any further aio_waitn calls will sleep here until the
663 * active aio_waitn finishes and leaves the kernel.
664 * If such a call does not block (poll), it returns
665 * immediately with the error code EAGAIN.
666 * If the second call should block, then sleep here, but
667 * do not touch the timeout. The timeout starts when this
668 * aio_waitn-call becomes active.
669 */
670
671 mutex_enter(&aiop->aio_mutex);
672
673 while (aiop->aio_flags & AIO_WAITN) {
674 if (blocking == 0) {
675 mutex_exit(&aiop->aio_mutex);
676 return (EAGAIN);
677 }
678
679 /* block, no timeout */
680 aiop->aio_flags |= AIO_WAITN_PENDING;
681 if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
682 mutex_exit(&aiop->aio_mutex);
683 return (EINTR);
684 }
685 }
686
687 /*
688 * Establish the absolute future time for the timeout.
689 */
690 if (rqtp) {
691 timestruc_t now;
692 timecheck = timechanged;
693 gethrestime(&now);
694 timespecadd(rqtp, &now);
695 }
696
697 if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
698 kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
699 aiop->aio_iocb = NULL;
700 }
701
702 if (aiop->aio_iocb == NULL) {
703 iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
704 if (iocblist == NULL) {
705 mutex_exit(&aiop->aio_mutex);
706 return (ENOMEM);
707 }
708 aiop->aio_iocb = (aiocb_t **)iocblist;
709 aiop->aio_iocbsz = iocbsz;
710 } else {
711 iocblist = (char *)aiop->aio_iocb;
712 }
713
714 aiop->aio_waitncnt = waitcnt;
715 aiop->aio_flags |= AIO_WAITN;
716
717 for (;;) {
718 /* push requests on poll queue to done queue */
719 if (aiop->aio_pollq) {
720 mutex_exit(&aiop->aio_mutex);
721 aio_cleanup(0);
722 mutex_enter(&aiop->aio_mutex);
723 }
724
725 /* check for requests on done queue */
726 if (aiop->aio_doneq) {
727 cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
728 aiop->aio_waitncnt = waitcnt - cnt;
729 }
730
731 /* user-level done queue might not be empty */
732 if (aiop->aio_notifycnt > 0) {
733 aiop->aio_notifycnt--;
734 error = 0;
735 break;
736 }
737
738 /*
739 * If we are here a second time as a result of timer
740 * expiration, we reset the error if there are enough
741 * aiocb's to satisfy the request.
742 * We also return if all requests are already done
743 * and we have picked up the whole done queue.
744 */
745
746 if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
747 aiop->aio_doneq == NULL)) {
748 error = 0;
749 break;
750 }
751
752 if ((cnt < waitcnt) && blocking) {
753 int rval = cv_waituntil_sig(&aiop->aio_waitcv,
754 &aiop->aio_mutex, rqtp, timecheck);
755 if (rval > 0)
756 continue;
757 if (rval < 0) {
758 error = ETIME;
759 blocking = 0;
760 continue;
761 }
762 error = EINTR;
763 }
764 break;
765 }
766
767 mutex_exit(&aiop->aio_mutex);
768
769 if (cnt > 0) {
770
771 iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
772 aiop, model);
773
774 if (model == DATAMODEL_NATIVE)
775 riocbsz = (sizeof (aiocb_t *) * cnt);
776 #ifdef _SYSCALL32_IMPL
777 else
778 riocbsz = (sizeof (caddr32_t) * cnt);
779 #endif /* _SYSCALL32_IMPL */
780
781 if (copyout(iocblist, uiocb, riocbsz) ||
782 copyout(&cnt, nwait, sizeof (uint_t)))
783 error = EFAULT;
784 }
785
786 /* check if there is another thread waiting for execution */
787 mutex_enter(&aiop->aio_mutex);
788 aiop->aio_flags &= ~AIO_WAITN;
789 if (aiop->aio_flags & AIO_WAITN_PENDING) {
790 aiop->aio_flags &= ~AIO_WAITN_PENDING;
791 cv_signal(&aiop->aio_waitncv);
792 }
793 mutex_exit(&aiop->aio_mutex);
794
795 return (error);
796 }
797
798 /*
799 * aio_unlock_requests
800 * copies out the result of the request as well as the return value.
801 * It builds the list of completed asynchronous requests,
802 * unlocks the allocated memory ranges and
803 * puts the aio request structure back into the free list.
804 */
805
806 static int
807 aio_unlock_requests(
808 caddr_t iocblist,
809 int iocb_index,
810 aio_req_t *reqlist,
811 aio_t *aiop,
812 model_t model)
813 {
814 aio_req_t *reqp, *nreqp;
815
816 if (model == DATAMODEL_NATIVE) {
817 for (reqp = reqlist; reqp != NULL; reqp = nreqp) {
818 (((caddr_t *)iocblist)[iocb_index++]) =
819 reqp->aio_req_iocb.iocb;
820 nreqp = reqp->aio_req_next;
821 aphysio_unlock(reqp);
822 aio_copyout_result(reqp);
823 mutex_enter(&aiop->aio_mutex);
824 aio_req_free(aiop, reqp);
825 mutex_exit(&aiop->aio_mutex);
826 }
827 }
828 #ifdef _SYSCALL32_IMPL
829 else {
830 for (reqp = reqlist; reqp != NULL; reqp = nreqp) {
831 ((caddr32_t *)iocblist)[iocb_index++] =
832 reqp->aio_req_iocb.iocb32;
833 nreqp = reqp->aio_req_next;
834 aphysio_unlock(reqp);
835 aio_copyout_result(reqp);
836 mutex_enter(&aiop->aio_mutex);
837 aio_req_free(aiop, reqp);
838 mutex_exit(&aiop->aio_mutex);
839 }
840 }
841 #endif /* _SYSCALL32_IMPL */
842 return (iocb_index);
843 }
844
845 /*
846 * aio_reqlist_concat
847 * moves "max" elements from the done queue to the reqlist queue and removes
848 * the AIO_DONEQ flag.
849 * - the reqlist queue is a simple, singly linked list
850 * - the done queue is a doubly linked list
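 *   (circular: the walk in the function below detects that it has taken
 *   every element when aio_req_next wraps back around to the head)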
851 */
852
853 static int
854 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
855 {
856 aio_req_t *q2, *q2work, *list;
857 int count = 0;
858
859 list = *reqlist;
860 q2 = aiop->aio_doneq;
861 q2work = q2;
862 while (max-- > 0) {
863 q2work->aio_req_flags &= ~AIO_DONEQ;
864 q2work = q2work->aio_req_next;
865 count++;
866 if (q2work == q2)
867 break;
868 }
869
870 if (q2work == q2) {
871 /* all elements were visited -- take the entire done queue */
872 q2->aio_req_prev->aio_req_next = list;
873 list = q2;
874 aiop->aio_doneq = NULL;
875 } else {
876 /*
877 * max < number of elements in the doneq:
878 * detach only the required number of elements
879 * from the doneq
880 */
881 q2work->aio_req_prev->aio_req_next = list;
882 list = q2;
883
884 aiop->aio_doneq = q2work;
885 q2work->aio_req_prev = q2->aio_req_prev;
886 q2->aio_req_prev->aio_req_next = q2work;
887 }
888 *reqlist = list;
889 return (count);
890 }
891
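/*
 * aiosuspend:  wait until at least one of the requests named by the
 * user's aiocb list has completed (or the timeout expires).  Completed
 * requests found on the done queue have their results copied out and
 * their aio_req_t structures freed before returning.
 */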
892 /*ARGSUSED*/
893 static int
894 aiosuspend(void *aiocb, int nent, struct timespec *timout, int flag,
895 long *rval, int run_mode)
896 {
897 int error;
898 aio_t *aiop;
899 aio_req_t *reqp, *found, *next;
900 caddr_t cbplist = NULL;
901 aiocb_t *cbp, **ucbp;
902 #ifdef _SYSCALL32_IMPL
903 aiocb32_t *cbp32;
904 caddr32_t *ucbp32;
905 #endif /* _SYSCALL32_IMPL */
906 aiocb64_32_t *cbp64;
907 int rv;
908 int i;
909 size_t ssize;
910 model_t model = get_udatamodel();
911 int blocking;
912 int timecheck;
913 timestruc_t rqtime;
914 timestruc_t *rqtp;
915
916 aiop = curproc->p_aio;
917 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
918 return (EINVAL);
919
920 /*
921 * Establish the absolute future time for the timeout.
922 */
923 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
924 if (error)
925 return (error);
926 if (rqtp) {
927 timestruc_t now;
928 timecheck = timechanged;
929 gethrestime(&now);
930 timespecadd(rqtp, &now);
931 }
932
933 /*
934 * If we are not blocking and there's no IO complete,
935 * skip the aiocb copyin.
936 */
937 if (!blocking && (aiop->aio_pollq == NULL) &&
938 (aiop->aio_doneq == NULL)) {
939 return (EAGAIN);
940 }
941
942 if (model == DATAMODEL_NATIVE)
943 ssize = (sizeof (aiocb_t *) * nent);
944 #ifdef _SYSCALL32_IMPL
945 else
946 ssize = (sizeof (caddr32_t) * nent);
947 #endif /* _SYSCALL32_IMPL */
948
949 cbplist = kmem_alloc(ssize, KM_NOSLEEP);
950 if (cbplist == NULL)
951 return (ENOMEM);
952
953 if (copyin(aiocb, cbplist, ssize)) {
954 error = EFAULT;
955 goto done;
956 }
957
958 found = NULL;
959 /*
960 * we need to get the aio_cleanupq_mutex since we call
961 * aio_req_done().
962 */
963 mutex_enter(&aiop->aio_cleanupq_mutex);
964 mutex_enter(&aiop->aio_mutex);
965 for (;;) {
966 /* push requests on poll queue to done queue */
967 if (aiop->aio_pollq) {
968 mutex_exit(&aiop->aio_mutex);
969 mutex_exit(&aiop->aio_cleanupq_mutex);
970 aio_cleanup(0);
971 mutex_enter(&aiop->aio_cleanupq_mutex);
972 mutex_enter(&aiop->aio_mutex);
973 }
974 /* check for requests on done queue */
975 if (aiop->aio_doneq) {
976 if (model == DATAMODEL_NATIVE)
977 ucbp = (aiocb_t **)cbplist;
978 #ifdef _SYSCALL32_IMPL
979 else
980 ucbp32 = (caddr32_t *)cbplist;
981 #endif /* _SYSCALL32_IMPL */
982 for (i = 0; i < nent; i++) {
983 if (model == DATAMODEL_NATIVE) {
984 if ((cbp = *ucbp++) == NULL)
985 continue;
986 if (run_mode != AIO_LARGEFILE)
987 reqp = aio_req_done(
988 &cbp->aio_resultp);
989 else {
990 cbp64 = (aiocb64_32_t *)cbp;
991 reqp = aio_req_done(
992 &cbp64->aio_resultp);
993 }
994 }
995 #ifdef _SYSCALL32_IMPL
996 else {
997 if (run_mode == AIO_32) {
998 if ((cbp32 =
999 (aiocb32_t *)(uintptr_t)
1000 *ucbp32++) == NULL)
1001 continue;
1002 reqp = aio_req_done(
1003 &cbp32->aio_resultp);
1004 } else if (run_mode == AIO_LARGEFILE) {
1005 if ((cbp64 =
1006 (aiocb64_32_t *)(uintptr_t)
1007 *ucbp32++) == NULL)
1008 continue;
1009 reqp = aio_req_done(
1010 &cbp64->aio_resultp);
1011 }
1012
1013 }
1014 #endif /* _SYSCALL32_IMPL */
1015 if (reqp) {
1016 reqp->aio_req_next = found;
1017 found = reqp;
1018 }
1019 if (aiop->aio_doneq == NULL)
1020 break;
1021 }
1022 if (found)
1023 break;
1024 }
1025 if (aiop->aio_notifycnt > 0) {
1026 /*
1027 * nothing on the kernel's queue. the user
1028 * has notified the kernel that it has items
1029 * on a user-level queue.
1030 */
1031 aiop->aio_notifycnt--;
1032 *rval = 1;
1033 error = 0;
1034 break;
1035 }
1036 /* don't block if nothing is outstanding */
1037 if (aiop->aio_outstanding == 0) {
1038 error = EAGAIN;
1039 break;
1040 }
1041 if (blocking) {
1042 /*
1043 * drop the aio_cleanupq_mutex as we are
1044 * going to block.
1045 */
1046 mutex_exit(&aiop->aio_cleanupq_mutex);
1047 rv = cv_waituntil_sig(&aiop->aio_waitcv,
1048 &aiop->aio_mutex, rqtp, timecheck);
1049 /*
1050 * we have to drop aio_mutex and
1051 * grab it in the right order.
1052 */
1053 mutex_exit(&aiop->aio_mutex);
1054 mutex_enter(&aiop->aio_cleanupq_mutex);
1055 mutex_enter(&aiop->aio_mutex);
1056 if (rv > 0) /* check done queue again */
1057 continue;
1058 if (rv == 0) /* interrupted by a signal */
1059 error = EINTR;
1060 else /* timer expired */
1061 error = ETIME;
1062 } else {
1063 error = EAGAIN;
1064 }
1065 break;
1066 }
1067 mutex_exit(&aiop->aio_mutex);
1068 mutex_exit(&aiop->aio_cleanupq_mutex);
1069 for (reqp = found; reqp != NULL; reqp = next) {
1070 next = reqp->aio_req_next;
1071 aphysio_unlock(reqp);
1072 aio_copyout_result(reqp);
1073 mutex_enter(&aiop->aio_mutex);
1074 aio_req_free(aiop, reqp);
1075 mutex_exit(&aiop->aio_mutex);
1076 }
1077 done:
1078 kmem_free(cbplist, ssize);
1079 return (error);
1080 }
1081
1082 /*
1083 * initialize aio by allocating an aio_t struct for this
1084 * process.
1085 */
1086 static int
1087 aioinit(void)
1088 {
1089 proc_t *p = curproc;
1090 aio_t *aiop;
1091 mutex_enter(&p->p_lock);
1092 if ((aiop = p->p_aio) == NULL) {
1093 aiop = aio_aiop_alloc();
1094 p->p_aio = aiop;
1095 }
1096 mutex_exit(&p->p_lock);
1097 if (aiop == NULL)
1098 return (ENOMEM);
1099 return (0);
1100 }
1101
1102 /*
1103 * start a special thread that will clean up after aio requests
1104 * that are preventing a segment from being unmapped. as_unmap()
1105 * blocks until all physio to this segment is completed. This
1106 * doesn't happen until all the pages in this segment are not
1107 * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
1108 * requests still outstanding. This special thread will make sure
1109 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
1110 *
1111 * this function will return an error if the process has only
1112 * one LWP. the assumption is that the caller is a separate LWP
1113 * that remains blocked in the kernel for the life of this process.
1114 */
1115 static int
1116 aiostart(void)
1117 {
1118 proc_t *p = curproc;
1119 aio_t *aiop;
1120 int first, error = 0;
1121
1122 if (p->p_lwpcnt == 1)
1123 return (EDEADLK);
1124 mutex_enter(&p->p_lock);
1125 if ((aiop = p->p_aio) == NULL)
1126 error = EINVAL;
1127 else {
1128 first = aiop->aio_ok;
1129 if (aiop->aio_ok == 0)
1130 aiop->aio_ok = 1;
1131 }
1132 mutex_exit(&p->p_lock);
1133 if (error == 0 && first == 0) {
1134 return (aio_cleanup_thread(aiop));
1135 /* should return only to exit */
1136 }
1137 return (error);
1138 }
1139
1140 /*
1141 * Associate an aiocb with a port.
1142 * This function is used by aiorw() to associate a transaction with a port.
1143 * Allocate an event port structure (port_alloc_event()) and store the
1144 * delivered user pointer (portnfy_user) in the portkev_user field of the
1145 * port_kevent_t structure.
1146 * The aio_req_portkev pointer in the aio_req_t structure was added to identify
1147 * the port association.
1148 */
1149
1150 static int
1151 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
1152 aio_req_t *reqp, int event)
1153 {
1154 port_kevent_t *pkevp = NULL;
1155 int error;
1156
1157 error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
1158 PORT_SOURCE_AIO, &pkevp);
1159 if (error) {
1160 if ((error == ENOMEM) || (error == EAGAIN))
1161 error = EAGAIN;
1162 else
1163 error = EINVAL;
1164 } else {
1165 port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
1166 aio_port_callback, reqp);
1167 pkevp->portkev_events = event;
1168 reqp->aio_req_portkev = pkevp;
1169 reqp->aio_req_port = pntfy->portnfy_port;
1170 }
1171 return (error);
1172 }
1173
1174 #ifdef _LP64
1175
1176 /*
1177 * Asynchronous list IO. A chain of aiocb's is copied in
1178 * one at a time. If the aiocb is invalid, it is skipped.
1179 * For each aiocb, the appropriate driver entry point is
1180 * called. Optimize for the common case where the list
1181 * of requests is to the same file descriptor.
1182 *
1183 * One possible optimization is to define a new driver entry
1184 * point that supports a list of IO requests. Whether this
1185 * improves performance depends somewhat on the driver's
1186 * locking strategy. Processing a list could adversely impact
1187 * the driver's interrupt latency.
1188 */
1189 static int
1190 alio(
1191 int mode_arg,
1192 aiocb_t **aiocb_arg,
1193 int nent,
1194 struct sigevent *sigev)
1195 {
1196 file_t *fp;
1197 file_t *prev_fp = NULL;
1198 int prev_mode = -1;
1199 struct vnode *vp;
1200 aio_lio_t *head;
1201 aio_req_t *reqp;
1202 aio_t *aiop;
1203 caddr_t cbplist;
1204 aiocb_t cb;
1205 aiocb_t *aiocb = &cb;
1206 aiocb_t *cbp;
1207 aiocb_t **ucbp;
1208 struct sigevent sigevk;
1209 sigqueue_t *sqp;
1210 int (*aio_func)();
1211 int mode;
1212 int error = 0;
1213 int aio_errors = 0;
1214 int i;
1215 size_t ssize;
1216 int deadhead = 0;
1217 int aio_notsupported = 0;
1218 int lio_head_port;
1219 int aio_port;
1220 int aio_thread;
1221 port_kevent_t *pkevtp = NULL;
1222 int portused = 0;
1223 port_notify_t pnotify;
1224 int event;
1225
1226 aiop = curproc->p_aio;
1227 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1228 return (EINVAL);
1229
1230 ssize = (sizeof (aiocb_t *) * nent);
1231 cbplist = kmem_alloc(ssize, KM_SLEEP);
1232 ucbp = (aiocb_t **)cbplist;
1233
1234 if (copyin(aiocb_arg, cbplist, ssize) ||
1235 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
1236 kmem_free(cbplist, ssize);
1237 return (EFAULT);
1238 }
1239
1240 /* Event Ports */
1241 if (sigev &&
1242 (sigevk.sigev_notify == SIGEV_THREAD ||
1243 sigevk.sigev_notify == SIGEV_PORT)) {
1244 if (sigevk.sigev_notify == SIGEV_THREAD) {
1245 pnotify.portnfy_port = sigevk.sigev_signo;
1246 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
1247 } else if (copyin(sigevk.sigev_value.sival_ptr,
1248 &pnotify, sizeof (pnotify))) {
1249 kmem_free(cbplist, ssize);
1250 return (EFAULT);
1251 }
1252 error = port_alloc_event(pnotify.portnfy_port,
1253 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
1254 if (error) {
1255 if (error == ENOMEM || error == EAGAIN)
1256 error = EAGAIN;
1257 else
1258 error = EINVAL;
1259 kmem_free(cbplist, ssize);
1260 return (error);
1261 }
1262 lio_head_port = pnotify.portnfy_port;
1263 portused = 1;
1264 }
1265
1266 /*
1267 * a list head should be allocated if notification is
1268 * enabled for this list.
1269 */
1270 head = NULL;
1271
1272 if (mode_arg == LIO_WAIT || sigev) {
1273 mutex_enter(&aiop->aio_mutex);
1274 error = aio_lio_alloc(&head);
1275 mutex_exit(&aiop->aio_mutex);
1276 if (error)
1277 goto done;
1278 deadhead = 1;
1279 head->lio_nent = nent;
1280 head->lio_refcnt = nent;
1281 head->lio_port = -1;
1282 head->lio_portkev = NULL;
1283 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
1284 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
1285 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
1286 if (sqp == NULL) {
1287 error = EAGAIN;
1288 goto done;
1289 }
1290 sqp->sq_func = NULL;
1291 sqp->sq_next = NULL;
1292 sqp->sq_info.si_code = SI_ASYNCIO;
1293 sqp->sq_info.si_pid = curproc->p_pid;
1294 sqp->sq_info.si_ctid = PRCTID(curproc);
1295 sqp->sq_info.si_zoneid = getzoneid();
1296 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
1297 sqp->sq_info.si_signo = sigevk.sigev_signo;
1298 sqp->sq_info.si_value = sigevk.sigev_value;
1299 head->lio_sigqp = sqp;
1300 } else {
1301 head->lio_sigqp = NULL;
1302 }
1303 if (pkevtp) {
1304 /*
1305 * Prepare data to send when list of aiocb's
1306 * has completed.
1307 */
1308 port_init_event(pkevtp, (uintptr_t)sigev,
1309 (void *)(uintptr_t)pnotify.portnfy_user,
1310 NULL, head);
1311 pkevtp->portkev_events = AIOLIO;
1312 head->lio_portkev = pkevtp;
1313 head->lio_port = pnotify.portnfy_port;
1314 }
1315 }
1316
1317 for (i = 0; i < nent; i++, ucbp++) {
1318
1319 cbp = *ucbp;
1320 /* skip entry if it can't be copied. */
1321 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
1322 if (head) {
1323 mutex_enter(&aiop->aio_mutex);
1324 head->lio_nent--;
1325 head->lio_refcnt--;
1326 mutex_exit(&aiop->aio_mutex);
1327 }
1328 continue;
1329 }
1330
1331 /* skip if opcode for aiocb is LIO_NOP */
1332 mode = aiocb->aio_lio_opcode;
1333 if (mode == LIO_NOP) {
1334 cbp = NULL;
1335 if (head) {
1336 mutex_enter(&aiop->aio_mutex);
1337 head->lio_nent--;
1338 head->lio_refcnt--;
1339 mutex_exit(&aiop->aio_mutex);
1340 }
1341 continue;
1342 }
1343
1344 /* increment file descriptor's ref count. */
1345 if ((fp = getf(aiocb->aio_fildes)) == NULL) {
1346 lio_set_uerror(&cbp->aio_resultp, EBADF);
1347 if (head) {
1348 mutex_enter(&aiop->aio_mutex);
1349 head->lio_nent--;
1350 head->lio_refcnt--;
1351 mutex_exit(&aiop->aio_mutex);
1352 }
1353 aio_errors++;
1354 continue;
1355 }
1356
1357 /*
1358 * check the permission of the partition
1359 */
1360 if ((fp->f_flag & mode) == 0) {
1361 releasef(aiocb->aio_fildes);
1362 lio_set_uerror(&cbp->aio_resultp, EBADF);
1363 if (head) {
1364 mutex_enter(&aiop->aio_mutex);
1365 head->lio_nent--;
1366 head->lio_refcnt--;
1367 mutex_exit(&aiop->aio_mutex);
1368 }
1369 aio_errors++;
1370 continue;
1371 }
1372
1373 /*
1374 * common case where requests are to the same fd
1375 * for the same r/w operation.
1376 * for UFS, need to set EBADFD
1377 */
1378 vp = fp->f_vnode;
1379 if (fp != prev_fp || mode != prev_mode) {
1380 aio_func = check_vp(vp, mode);
1381 if (aio_func == NULL) {
1382 prev_fp = NULL;
1383 releasef(aiocb->aio_fildes);
1384 lio_set_uerror(&cbp->aio_resultp, EBADFD);
1385 aio_notsupported++;
1386 if (head) {
1387 mutex_enter(&aiop->aio_mutex);
1388 head->lio_nent--;
1389 head->lio_refcnt--;
1390 mutex_exit(&aiop->aio_mutex);
1391 }
1392 continue;
1393 } else {
1394 prev_fp = fp;
1395 prev_mode = mode;
1396 }
1397 }
1398
1399 error = aio_req_setup(&reqp, aiop, aiocb,
1400 &cbp->aio_resultp, vp, 0);
1401 if (error) {
1402 releasef(aiocb->aio_fildes);
1403 lio_set_uerror(&cbp->aio_resultp, error);
1404 if (head) {
1405 mutex_enter(&aiop->aio_mutex);
1406 head->lio_nent--;
1407 head->lio_refcnt--;
1408 mutex_exit(&aiop->aio_mutex);
1409 }
1410 aio_errors++;
1411 continue;
1412 }
1413
1414 reqp->aio_req_lio = head;
1415 deadhead = 0;
1416
1417 /*
1418 * Set the errno field now before sending the request to
1419 * the driver to avoid a race condition
1420 */
1421 (void) suword32(&cbp->aio_resultp.aio_errno,
1422 EINPROGRESS);
1423
1424 reqp->aio_req_iocb.iocb = (caddr_t)cbp;
1425
1426 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
1427 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
1428 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
1429 if (aio_port | aio_thread) {
1430 port_kevent_t *lpkevp;
1431 /*
1432 * Prepare data to send with each aiocb completed.
1433 */
1434 if (aio_port) {
1435 void *paddr =
1436 aiocb->aio_sigevent.sigev_value.sival_ptr;
1437 if (copyin(paddr, &pnotify, sizeof (pnotify)))
1438 error = EFAULT;
1439 } else { /* aio_thread */
1440 pnotify.portnfy_port =
1441 aiocb->aio_sigevent.sigev_signo;
1442 pnotify.portnfy_user =
1443 aiocb->aio_sigevent.sigev_value.sival_ptr;
1444 }
1445 if (error)
1446 /* EMPTY */;
1447 else if (pkevtp != NULL &&
1448 pnotify.portnfy_port == lio_head_port)
1449 error = port_dup_event(pkevtp, &lpkevp,
1450 PORT_ALLOC_DEFAULT);
1451 else
1452 error = port_alloc_event(pnotify.portnfy_port,
1453 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
1454 &lpkevp);
1455 if (error == 0) {
1456 port_init_event(lpkevp, (uintptr_t)cbp,
1457 (void *)(uintptr_t)pnotify.portnfy_user,
1458 aio_port_callback, reqp);
1459 lpkevp->portkev_events = event;
1460 reqp->aio_req_portkev = lpkevp;
1461 reqp->aio_req_port = pnotify.portnfy_port;
1462 }
1463 }
1464
1465 /*
1466 * send the request to driver.
1467 */
1468 if (error == 0) {
1469 if (aiocb->aio_nbytes == 0) {
1470 clear_active_fd(aiocb->aio_fildes);
1471 aio_zerolen(reqp);
1472 continue;
1473 }
1474 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
1475 CRED());
1476 }
1477
1478 /*
1479 * the fd's ref count is not decremented until the IO has
1480 * completed unless there was an error.
1481 */
1482 if (error) {
1483 releasef(aiocb->aio_fildes);
1484 lio_set_uerror(&cbp->aio_resultp, error);
1485 if (head) {
1486 mutex_enter(&aiop->aio_mutex);
1487 head->lio_nent--;
1488 head->lio_refcnt--;
1489 mutex_exit(&aiop->aio_mutex);
1490 }
1491 if (error == ENOTSUP)
1492 aio_notsupported++;
1493 else
1494 aio_errors++;
1495 lio_set_error(reqp, portused);
1496 } else {
1497 clear_active_fd(aiocb->aio_fildes);
1498 }
1499 }
1500
1501 if (aio_notsupported) {
1502 error = ENOTSUP;
1503 } else if (aio_errors) {
1504 /*
1505 * return EIO if any request failed
1506 */
1507 error = EIO;
1508 }
1509
1510 if (mode_arg == LIO_WAIT) {
1511 mutex_enter(&aiop->aio_mutex);
1512 while (head->lio_refcnt > 0) {
1513 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1514 mutex_exit(&aiop->aio_mutex);
1515 error = EINTR;
1516 goto done;
1517 }
1518 }
1519 mutex_exit(&aiop->aio_mutex);
1520 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
1521 }
1522
1523 done:
1524 kmem_free(cbplist, ssize);
1525 if (deadhead) {
1526 if (head->lio_sigqp)
1527 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
1528 if (head->lio_portkev)
1529 port_free_event(head->lio_portkev);
1530 kmem_free(head, sizeof (aio_lio_t));
1531 }
1532 return (error);
1533 }
1534
1535 #endif /* _LP64 */
1536
1537 /*
1538 * Asynchronous list IO.
1539 * If list I/O is called with LIO_WAIT it can still return
1540 * before all the I/O's are completed if a signal is caught
1541 * or if the list includes UFS I/O requests. If this happens,
1542 * libaio will call aliowait() to wait for the I/O's to
1543 * complete.
1544 */
1545 /*ARGSUSED*/
1546 static int
1547 aliowait(
1548 int mode,
1549 void *aiocb,
1550 int nent,
1551 void *sigev,
1552 int run_mode)
1553 {
1554 aio_lio_t *head;
1555 aio_t *aiop;
1556 caddr_t cbplist;
1557 aiocb_t *cbp, **ucbp;
1558 #ifdef _SYSCALL32_IMPL
1559 aiocb32_t *cbp32;
1560 caddr32_t *ucbp32;
1561 aiocb64_32_t *cbp64;
1562 #endif
1563 int error = 0;
1564 int i;
1565 size_t ssize = 0;
1566 model_t model = get_udatamodel();
1567
1568 aiop = curproc->p_aio;
1569 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1570 return (EINVAL);
1571
1572 if (model == DATAMODEL_NATIVE)
1573 ssize = (sizeof (aiocb_t *) * nent);
1574 #ifdef _SYSCALL32_IMPL
1575 else
1576 ssize = (sizeof (caddr32_t) * nent);
1577 #endif /* _SYSCALL32_IMPL */
1578
1579 if (ssize == 0)
1580 return (EINVAL);
1581
1582 cbplist = kmem_alloc(ssize, KM_SLEEP);
1583
1584 if (model == DATAMODEL_NATIVE)
1585 ucbp = (aiocb_t **)cbplist;
1586 #ifdef _SYSCALL32_IMPL
1587 else
1588 ucbp32 = (caddr32_t *)cbplist;
1589 #endif /* _SYSCALL32_IMPL */
1590
1591 if (copyin(aiocb, cbplist, ssize)) {
1592 error = EFAULT;
1593 goto done;
1594 }
1595
1596 /*
1597 * To find the list head, we go through the
1598 * list of aiocb structs, find the request
1599 * it's for, then get the list head that reqp
1600 * points to.
1601 */
1602 head = NULL;
1603
1604 for (i = 0; i < nent; i++) {
1605 if (model == DATAMODEL_NATIVE) {
1606 /*
1607 * Since we are only checking for a NULL pointer,
1608 * the following should work on both native data sizes
1609 * as well as for largefile aiocbs.
1610 */
1611 if ((cbp = *ucbp++) == NULL)
1612 continue;
1613 if (run_mode != AIO_LARGEFILE)
1614 if (head = aio_list_get(&cbp->aio_resultp))
1615 break;
1616 else {
1617 /*
1618 * This is the case where a largefile call is
1619 * made on a 32-bit kernel.
1620 * Treat each pointer as a pointer to
1621 * aiocb64_32.
1622 */
1623 if (head = aio_list_get((aio_result_t *)
1624 &(((aiocb64_32_t *)cbp)->aio_resultp)))
1625 break;
1626 }
1627 }
1628 #ifdef _SYSCALL32_IMPL
1629 else {
1630 if (run_mode == AIO_LARGEFILE) {
1631 if ((cbp64 = (aiocb64_32_t *)
1632 (uintptr_t)*ucbp32++) == NULL)
1633 continue;
1634 if (head = aio_list_get((aio_result_t *)
1635 &cbp64->aio_resultp))
1636 break;
1637 } else if (run_mode == AIO_32) {
1638 if ((cbp32 = (aiocb32_t *)
1639 (uintptr_t)*ucbp32++) == NULL)
1640 continue;
1641 if (head = aio_list_get((aio_result_t *)
1642 &cbp32->aio_resultp))
1643 break;
1644 }
1645 }
1646 #endif /* _SYSCALL32_IMPL */
1647 }
1648
1649 if (head == NULL) {
1650 error = EINVAL;
1651 goto done;
1652 }
1653
1654 mutex_enter(&aiop->aio_mutex);
1655 while (head->lio_refcnt > 0) {
1656 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1657 mutex_exit(&aiop->aio_mutex);
1658 error = EINTR;
1659 goto done;
1660 }
1661 }
1662 mutex_exit(&aiop->aio_mutex);
1663 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
1664 done:
1665 kmem_free(cbplist, ssize);
1666 return (error);
1667 }
1668
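/*
 * Look up the request whose result pointer is resultp in the per-process
 * hash table and return the list I/O head it belongs to, or NULL if there
 * is no such request (or no list head).
 */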
1669 aio_lio_t *
1670 aio_list_get(aio_result_t *resultp)
1671 {
1672 aio_lio_t *head = NULL;
1673 aio_t *aiop;
1674 aio_req_t **bucket;
1675 aio_req_t *reqp;
1676 long index;
1677
1678 aiop = curproc->p_aio;
1679 if (aiop == NULL)
1680 return (NULL);
1681
1682 if (resultp) {
1683 index = AIO_HASH(resultp);
1684 bucket = &aiop->aio_hash[index];
1685 for (reqp = *bucket; reqp != NULL;
1686 reqp = reqp->aio_hash_next) {
1687 if (reqp->aio_req_resultp == resultp) {
1688 head = reqp->aio_req_lio;
1689 return (head);
1690 }
1691 }
1692 }
1693 return (NULL);
1694 }
1695
1696
1697 static void
1698 lio_set_uerror(void *resultp, int error)
1699 {
1700 /*
1701 * the resultp field is a pointer to where the
1702 * error should be written out to the user's
1703 * aiocb.
1704 *
1705 */
1706 if (get_udatamodel() == DATAMODEL_NATIVE) {
1707 (void) sulword(&((aio_result_t *)resultp)->aio_return,
1708 (ssize_t)-1);
1709 (void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1710 }
1711 #ifdef _SYSCALL32_IMPL
1712 else {
1713 (void) suword32(&((aio_result32_t *)resultp)->aio_return,
1714 (uint_t)-1);
1715 (void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1716 }
1717 #endif /* _SYSCALL32_IMPL */
1718 }
1719
1720 /*
1721 * do cleanup completion for all requests in list. memory for
1722 * each request is also freed.
1723 */
1724 static void
1725 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
1726 {
1727 int i;
1728 aio_req_t *reqp;
1729 aio_result_t *resultp;
1730 aiocb64_32_t *aiocb_64;
1731
1732 for (i = 0; i < nent; i++) {
1733 if (get_udatamodel() == DATAMODEL_NATIVE) {
1734 if (cbp[i] == NULL)
1735 continue;
1736 if (run_mode == AIO_LARGEFILE) {
1737 aiocb_64 = (aiocb64_32_t *)cbp[i];
1738 resultp = (aio_result_t *)
1739 &aiocb_64->aio_resultp;
1740 } else
1741 resultp = &cbp[i]->aio_resultp;
1742 }
1743 #ifdef _SYSCALL32_IMPL
1744 else {
1745 aiocb32_t *aiocb_32;
1746 caddr32_t *cbp32;
1747
1748 cbp32 = (caddr32_t *)cbp;
1749 if (cbp32[i] == 0)
1750 continue;
1751 if (run_mode == AIO_32) {
1752 aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
1753 resultp = (aio_result_t *)&aiocb_32->
1754 aio_resultp;
1755 } else if (run_mode == AIO_LARGEFILE) {
1756 aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
1757 resultp = (aio_result_t *)&aiocb_64->
1758 aio_resultp;
1759 }
1760 }
1761 #endif /* _SYSCALL32_IMPL */
1762 /*
1763 * we need to get the aio_cleanupq_mutex since we call
1764 * aio_req_done().
1765 */
1766 mutex_enter(&aiop->aio_cleanupq_mutex);
1767 mutex_enter(&aiop->aio_mutex);
1768 reqp = aio_req_done(resultp);
1769 mutex_exit(&aiop->aio_mutex);
1770 mutex_exit(&aiop->aio_cleanupq_mutex);
1771 if (reqp != NULL) {
1772 aphysio_unlock(reqp);
1773 aio_copyout_result(reqp);
1774 mutex_enter(&aiop->aio_mutex);
1775 aio_req_free(aiop, reqp);
1776 mutex_exit(&aiop->aio_mutex);
1777 }
1778 }
1779 }
1780
1781 /*
1782 * Write out the results for an aio request that is done.
1783 */
1784 static int
1785 aioerror(void *cb, int run_mode)
1786 {
1787 aio_result_t *resultp;
1788 aio_t *aiop;
1789 aio_req_t *reqp;
1790 int retval;
1791
1792 aiop = curproc->p_aio;
1793 if (aiop == NULL || cb == NULL)
1794 return (EINVAL);
1795
1796 if (get_udatamodel() == DATAMODEL_NATIVE) {
1797 if (run_mode == AIO_LARGEFILE)
1798 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1799 aio_resultp;
1800 else
1801 resultp = &((aiocb_t *)cb)->aio_resultp;
1802 }
1803 #ifdef _SYSCALL32_IMPL
1804 else {
1805 if (run_mode == AIO_LARGEFILE)
1806 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1807 aio_resultp;
1808 else if (run_mode == AIO_32)
1809 resultp = (aio_result_t *)&((aiocb32_t *)cb)->
1810 aio_resultp;
1811 }
1812 #endif /* _SYSCALL32_IMPL */
1813 /*
1814 * we need to get the aio_cleanupq_mutex since we call
1815 * aio_req_find().
1816 */
1817 mutex_enter(&aiop->aio_cleanupq_mutex);
1818 mutex_enter(&aiop->aio_mutex);
1819 retval = aio_req_find(resultp, &reqp);
1820 mutex_exit(&aiop->aio_mutex);
1821 mutex_exit(&aiop->aio_cleanupq_mutex);
1822 if (retval == 0) {
1823 aphysio_unlock(reqp);
1824 aio_copyout_result(reqp);
1825 mutex_enter(&aiop->aio_mutex);
1826 aio_req_free(aiop, reqp);
1827 mutex_exit(&aiop->aio_mutex);
1828 return (0);
1829 } else if (retval == 1)
1830 return (EINPROGRESS);
1831 else if (retval == 2)
1832 return (EINVAL);
1833 return (0);
1834 }
1835
1836 /*
1837 * aio_cancel - if no requests outstanding,
1838 * return AIO_ALLDONE
1839 * else
1840 * return AIO_NOTCANCELED
1841 */
1842 static int
1843 aio_cancel(int fildes, void *cb, long *rval, int run_mode)
1844 {
1845 aio_t *aiop;
1846 void *resultp;
1847 int index;
1848 aio_req_t **bucket;
1849 aio_req_t *ent;
1850
1851
1852 /*
1853 * Verify valid file descriptor
1854 */
1855 if ((getf(fildes)) == NULL) {
1856 return (EBADF);
1857 }
1858 releasef(fildes);
1859
1860 aiop = curproc->p_aio;
1861 if (aiop == NULL)
1862 return (EINVAL);
1863
1864 if (aiop->aio_outstanding == 0) {
1865 *rval = AIO_ALLDONE;
1866 return (0);
1867 }
1868
1869 mutex_enter(&aiop->aio_mutex);
1870 if (cb != NULL) {
1871 if (get_udatamodel() == DATAMODEL_NATIVE) {
1872 if (run_mode == AIO_LARGEFILE)
1873 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1874 ->aio_resultp;
1875 else
1876 resultp = &((aiocb_t *)cb)->aio_resultp;
1877 }
1878 #ifdef _SYSCALL32_IMPL
1879 else {
1880 if (run_mode == AIO_LARGEFILE)
1881 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1882 ->aio_resultp;
1883 else if (run_mode == AIO_32)
1884 resultp = (aio_result_t *)&((aiocb32_t *)cb)
1885 ->aio_resultp;
1886 }
1887 #endif /* _SYSCALL32_IMPL */
1888 index = AIO_HASH(resultp);
1889 bucket = &aiop->aio_hash[index];
1890 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1891 if (ent->aio_req_resultp == resultp) {
1892 if ((ent->aio_req_flags & AIO_PENDING) == 0) {
1893 mutex_exit(&aiop->aio_mutex);
1894 *rval = AIO_ALLDONE;
1895 return (0);
1896 }
1897 mutex_exit(&aiop->aio_mutex);
1898 *rval = AIO_NOTCANCELED;
1899 return (0);
1900 }
1901 }
1902 mutex_exit(&aiop->aio_mutex);
1903 *rval = AIO_ALLDONE;
1904 return (0);
1905 }
1906
1907 for (index = 0; index < AIO_HASHSZ; index++) {
1908 bucket = &aiop->aio_hash[index];
1909 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1910 if (ent->aio_req_fd == fildes) {
1911 if ((ent->aio_req_flags & AIO_PENDING) != 0) {
1912 mutex_exit(&aiop->aio_mutex);
1913 *rval = AIO_NOTCANCELED;
1914 return (0);
1915 }
1916 }
1917 }
1918 }
1919 mutex_exit(&aiop->aio_mutex);
1920 *rval = AIO_ALLDONE;
1921 return (0);
1922 }
1923
1924 /*
1925 * Solaris version of asynchronous read and write
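 * (i.e., the old aioread()/aiowrite() style of request, where completion
 * status is reported through the caller-supplied aio_result_t rather than
 * a POSIX aiocb)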
1926 */
1927 static int
1928 arw(
1929 int opcode,
1930 int fdes,
1931 char *bufp,
1932 int bufsize,
1933 offset_t offset,
1934 aio_result_t *resultp,
1935 int mode)
1936 {
1937 file_t *fp;
1938 int error;
1939 struct vnode *vp;
1940 aio_req_t *reqp;
1941 aio_t *aiop;
1942 int (*aio_func)();
1943 #ifdef _LP64
1944 aiocb_t aiocb;
1945 #else
1946 aiocb64_32_t aiocb64;
1947 #endif
1948
1949 aiop = curproc->p_aio;
1950 if (aiop == NULL)
1951 return (EINVAL);
1952
1953 if ((fp = getf(fdes)) == NULL) {
1954 return (EBADF);
1955 }
1956
1957 /*
1958 * check the permission of the partition
1959 */
1960 if ((fp->f_flag & mode) == 0) {
1961 releasef(fdes);
1962 return (EBADF);
1963 }
1964
1965 vp = fp->f_vnode;
1966 aio_func = check_vp(vp, mode);
1967 if (aio_func == NULL) {
1968 releasef(fdes);
1969 return (EBADFD);
1970 }
1971 #ifdef _LP64
1972 aiocb.aio_fildes = fdes;
1973 aiocb.aio_buf = bufp;
1974 aiocb.aio_nbytes = bufsize;
1975 aiocb.aio_offset = offset;
1976 aiocb.aio_sigevent.sigev_notify = 0;
1977 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1);
1978 #else
1979 aiocb64.aio_fildes = fdes;
1980 aiocb64.aio_buf = (caddr32_t)bufp;
1981 aiocb64.aio_nbytes = bufsize;
1982 aiocb64.aio_offset = offset;
1983 aiocb64.aio_sigevent.sigev_notify = 0;
1984 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1);
1985 #endif
1986 if (error) {
1987 releasef(fdes);
1988 return (error);
1989 }
1990
1991 /*
1992 * enable polling on this request if the opcode has
1993 * the AIO poll bit set
1994 */
1995 if (opcode & AIO_POLL_BIT)
1996 reqp->aio_req_flags |= AIO_POLL;
1997
1998 if (bufsize == 0) {
1999 clear_active_fd(fdes);
2000 aio_zerolen(reqp);
2001 return (0);
2002 }
2003 /*
2004 * send the request to driver.
2005 */
2006 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2007 /*
2008 * the fd is stored in the aio_req_t by aio_req_setup(), and
2009 * is released by the aio_cleanup_thread() when the IO has
2010 * completed.
2011 */
2012 if (error) {
2013 releasef(fdes);
2014 mutex_enter(&aiop->aio_mutex);
2015 aio_req_free(aiop, reqp);
2016 aiop->aio_pending--;
2017 if (aiop->aio_flags & AIO_REQ_BLOCK)
2018 cv_signal(&aiop->aio_cleanupcv);
2019 mutex_exit(&aiop->aio_mutex);
2020 return (error);
2021 }
2022 clear_active_fd(fdes);
2023 return (0);
2024 }
2025
2026 /*
2027 * POSIX version of asynchronous read and write
2028 */
2029 static int
2030 aiorw(
2031 int opcode,
2032 void *aiocb_arg,
2033 int mode,
2034 int run_mode)
2035 {
2036 #ifdef _SYSCALL32_IMPL
2037 aiocb32_t aiocb32;
2038 struct sigevent32 *sigev32;
2039 port_notify32_t pntfy32;
2040 #endif
2041 aiocb64_32_t aiocb64;
2042 aiocb_t aiocb;
2043 file_t *fp;
2044 int error, fd;
2045 size_t bufsize;
2046 struct vnode *vp;
2047 aio_req_t *reqp;
2048 aio_t *aiop;
2049 int (*aio_func)();
2050 aio_result_t *resultp;
2051 struct sigevent *sigev;
2052 model_t model;
2053 int aio_use_port = 0;
2054 port_notify_t pntfy;
2055
2056 model = get_udatamodel();
2057 aiop = curproc->p_aio;
2058 if (aiop == NULL)
2059 return (EINVAL);
2060
2061 if (model == DATAMODEL_NATIVE) {
2062 if (run_mode != AIO_LARGEFILE) {
2063 if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
2064 return (EFAULT);
2065 bufsize = aiocb.aio_nbytes;
2066 resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
2067 if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
2068 return (EBADF);
2069 }
2070 sigev = &aiocb.aio_sigevent;
2071 } else {
2072 /*
2073 * We come here only when we make a largefile
2074 * call on a 32-bit kernel using the 32-bit library.
2075 */
2076 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2077 return (EFAULT);
2078 bufsize = aiocb64.aio_nbytes;
2079 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2080 ->aio_resultp);
2081 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2082 return (EBADF);
2083 sigev = (struct sigevent *)&aiocb64.aio_sigevent;
2084 }
2085
2086 if (sigev->sigev_notify == SIGEV_PORT) {
2087 if (copyin((void *)sigev->sigev_value.sival_ptr,
2088 &pntfy, sizeof (port_notify_t))) {
2089 releasef(fd);
2090 return (EFAULT);
2091 }
2092 aio_use_port = 1;
2093 } else if (sigev->sigev_notify == SIGEV_THREAD) {
2094 pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo;
2095 pntfy.portnfy_user =
2096 aiocb.aio_sigevent.sigev_value.sival_ptr;
2097 aio_use_port = 1;
2098 }
2099 }
2100 #ifdef _SYSCALL32_IMPL
2101 else {
2102 if (run_mode == AIO_32) {
2103 /* 32 bit system call is being made on 64 bit kernel */
2104 if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
2105 return (EFAULT);
2106
2107 bufsize = aiocb32.aio_nbytes;
2108 aiocb_32ton(&aiocb32, &aiocb);
2109 resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
2110 aio_resultp);
2111 if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
2112 return (EBADF);
2113 }
2114 sigev32 = &aiocb32.aio_sigevent;
2115 } else if (run_mode == AIO_LARGEFILE) {
2116 /*
2117 * We come here only when we make a largefile
2118 * call on a 64-bit kernel using the 32-bit library.
2119 */
2120 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2121 return (EFAULT);
2122 bufsize = aiocb64.aio_nbytes;
2123 aiocb_LFton(&aiocb64, &aiocb);
2124 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2125 ->aio_resultp);
2126 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2127 return (EBADF);
2128 sigev32 = &aiocb64.aio_sigevent;
2129 }
2130
2131 if (sigev32->sigev_notify == SIGEV_PORT) {
2132 if (copyin(
2133 (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
2134 &pntfy32, sizeof (port_notify32_t))) {
2135 releasef(fd);
2136 return (EFAULT);
2137 }
2138 pntfy.portnfy_port = pntfy32.portnfy_port;
2139 pntfy.portnfy_user = (void *)(uintptr_t)
2140 pntfy32.portnfy_user;
2141 aio_use_port = 1;
2142 } else if (sigev32->sigev_notify == SIGEV_THREAD) {
2143 pntfy.portnfy_port = sigev32->sigev_signo;
2144 pntfy.portnfy_user = (void *)(uintptr_t)
2145 sigev32->sigev_value.sival_ptr;
2146 aio_use_port = 1;
2147 }
2148 }
2149 #endif /* _SYSCALL32_IMPL */
2150
2151 /*
2152 * verify that the file was opened for the requested access (read/write)
2153 */
2154
2155 if ((fp->f_flag & mode) == 0) {
2156 releasef(fd);
2157 return (EBADF);
2158 }
2159
2160 vp = fp->f_vnode;
2161 aio_func = check_vp(vp, mode);
2162 if (aio_func == NULL) {
2163 releasef(fd);
2164 return (EBADFD);
2165 }
2166 if (run_mode == AIO_LARGEFILE)
2167 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0);
2168 else
2169 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0);
2170
2171 if (error) {
2172 releasef(fd);
2173 return (error);
2174 }
2175 /*
2176 * enable polling on this request if the opcode has
2177 * the AIO poll bit set
2178 */
2179 if (opcode & AIO_POLL_BIT)
2180 reqp->aio_req_flags |= AIO_POLL;
2181
2182 if (model == DATAMODEL_NATIVE)
2183 reqp->aio_req_iocb.iocb = aiocb_arg;
2184 #ifdef _SYSCALL32_IMPL
2185 else
2186 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
2187 #endif
2188
2189 if (aio_use_port) {
2190 int event = (run_mode == AIO_LARGEFILE)?
2191 ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
2192 ((mode == FREAD)? AIOAREAD : AIOAWRITE);
2193 error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
2194 }
2195
2196 /*
2197 * send the request to driver.
2198 */
2199 if (error == 0) {
2200 if (bufsize == 0) {
2201 clear_active_fd(fd);
2202 aio_zerolen(reqp);
2203 return (0);
2204 }
2205 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2206 }
2207
2208 /*
2209 * the fd is stored in the aio_req_t by aio_req_setup(), and
2210 * is released by the aio_cleanup_thread() when the IO has
2211 * completed.
2212 */
2213 if (error) {
2214 releasef(fd);
2215 mutex_enter(&aiop->aio_mutex);
2216 if (aio_use_port)
2217 aio_deq(&aiop->aio_portpending, reqp);
2218 aio_req_free(aiop, reqp);
2219 aiop->aio_pending--;
2220 if (aiop->aio_flags & AIO_REQ_BLOCK)
2221 cv_signal(&aiop->aio_cleanupcv);
2222 mutex_exit(&aiop->aio_mutex);
2223 return (error);
2224 }
2225 clear_active_fd(fd);
2226 return (0);
2227 }
2228
2229
2230 /*
2231 * set error for a list IO entry that failed.
2232 */
2233 static void
2234 lio_set_error(aio_req_t *reqp, int portused)
2235 {
2236 aio_t *aiop = curproc->p_aio;
2237
2238 if (aiop == NULL)
2239 return;
2240
2241 mutex_enter(&aiop->aio_mutex);
2242 if (portused)
2243 aio_deq(&aiop->aio_portpending, reqp);
2244 aiop->aio_pending--;
2245 /* request failed, AIO_PHYSIODONE set to avoid physio cleanup. */
2246 reqp->aio_req_flags |= AIO_PHYSIODONE;
2247 /*
2248 * Need to free the request now as it's never
2249 * going to get on the done queue.
2250 *
2251 * Note: aio_outstanding is decremented in
2252 * aio_req_free()
2253 */
2254 aio_req_free(aiop, reqp);
2255 if (aiop->aio_flags & AIO_REQ_BLOCK)
2256 cv_signal(&aiop->aio_cleanupcv);
2257 mutex_exit(&aiop->aio_mutex);
2258 }
2259
2260 /*
2261 * Check whether the specified request is done and, if so, remove it
2262 * from the done queue.  If NULL is specified, remove any request
2263 * from the done queue.
2264 */
2265 static aio_req_t *
2266 aio_req_done(void *resultp)
2267 {
2268 aio_req_t **bucket;
2269 aio_req_t *ent;
2270 aio_t *aiop = curproc->p_aio;
2271 long index;
2272
2273 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2274 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2275
2276 if (resultp) {
2277 index = AIO_HASH(resultp);
2278 bucket = &aiop->aio_hash[index];
2279 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2280 if (ent->aio_req_resultp == (aio_result_t *)resultp) {
2281 if (ent->aio_req_flags & AIO_DONEQ) {
2282 return (aio_req_remove(ent));
2283 }
2284 return (NULL);
2285 }
2286 }
2287 /* no match, resultp is invalid */
2288 return (NULL);
2289 }
2290 return (aio_req_remove(NULL));
2291 }
2292
2293 /*
2294 * Determine whether a user-level resultp pointer is associated with an
2295 * active IO request.  Zero is returned when the request is done,
2296 * and the request is removed from the done queue; only in that case
2297 * is the "reqp" pointer valid.  One is returned when the request
2298 * is still in progress.  Two is returned when the request
2299 * is invalid.
2300 */
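/*
 * Added commentary: the three return values of aio_req_find() are
 * typically interpreted by callers (e.g. the aioerror() path) roughly
 * as follows; this summary is illustrative, not part of the interface.
 *
 *	return value	meaning					typical result
 *	0		done; *reqp valid and dequeued		request's error
 *	1		still in progress			EINPROGRESS
 *	2		resultp matches no outstanding request	EINVAL
 */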
2301 static int
2302 aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
2303 {
2304 aio_req_t **bucket;
2305 aio_req_t *ent;
2306 aio_t *aiop = curproc->p_aio;
2307 long index;
2308
2309 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2310 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2311
2312 index = AIO_HASH(resultp);
2313 bucket = &aiop->aio_hash[index];
2314 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2315 if (ent->aio_req_resultp == resultp) {
2316 if (ent->aio_req_flags & AIO_DONEQ) {
2317 *reqp = aio_req_remove(ent);
2318 return (0);
2319 }
2320 return (1);
2321 }
2322 }
2323 /* no match, resultp is invalid */
2324 return (2);
2325 }
2326
2327 /*
2328 * remove a request from the done queue.
2329 */
2330 static aio_req_t *
2331 aio_req_remove(aio_req_t *reqp)
2332 {
2333 aio_t *aiop = curproc->p_aio;
2334
2335 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2336
2337 if (reqp != NULL) {
2338 ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2339 if (reqp->aio_req_next == reqp) {
2340 /* only one request on queue */
2341 if (reqp == aiop->aio_doneq) {
2342 aiop->aio_doneq = NULL;
2343 } else {
2344 ASSERT(reqp == aiop->aio_cleanupq);
2345 aiop->aio_cleanupq = NULL;
2346 }
2347 } else {
2348 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2349 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2350 /*
2351 * The request can be either on the aio_doneq or the
2352 * aio_cleanupq
2353 */
2354 if (reqp == aiop->aio_doneq)
2355 aiop->aio_doneq = reqp->aio_req_next;
2356
2357 if (reqp == aiop->aio_cleanupq)
2358 aiop->aio_cleanupq = reqp->aio_req_next;
2359 }
2360 reqp->aio_req_flags &= ~AIO_DONEQ;
2361 reqp->aio_req_next = NULL;
2362 reqp->aio_req_prev = NULL;
2363 } else if ((reqp = aiop->aio_doneq) != NULL) {
2364 ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2365 if (reqp == reqp->aio_req_next) {
2366 /* only one request on queue */
2367 aiop->aio_doneq = NULL;
2368 } else {
2369 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2370 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2371 aiop->aio_doneq = reqp->aio_req_next;
2372 }
2373 reqp->aio_req_flags &= ~AIO_DONEQ;
2374 reqp->aio_req_next = NULL;
2375 reqp->aio_req_prev = NULL;
2376 }
2377 if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN))
2378 cv_broadcast(&aiop->aio_waitcv);
2379 return (reqp);
2380 }
2381
2382 static int
2383 aio_req_setup(aio_req_t **reqpp, aio_t *aiop, aiocb_t *arg,
2384 aio_result_t *resultp, vnode_t *vp, int old_solaris_req)
2385 {
2386 sigqueue_t *sqp = NULL;
2387 aio_req_t *reqp;
2388 struct uio *uio;
2389 struct sigevent *sigev;
2390 int error;
2391
2392 sigev = &arg->aio_sigevent;
2393 if (sigev->sigev_notify == SIGEV_SIGNAL &&
2394 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
2395 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2396 if (sqp == NULL)
2397 return (EAGAIN);
2398 sqp->sq_func = NULL;
2399 sqp->sq_next = NULL;
2400 sqp->sq_info.si_code = SI_ASYNCIO;
2401 sqp->sq_info.si_pid = curproc->p_pid;
2402 sqp->sq_info.si_ctid = PRCTID(curproc);
2403 sqp->sq_info.si_zoneid = getzoneid();
2404 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2405 sqp->sq_info.si_signo = sigev->sigev_signo;
2406 sqp->sq_info.si_value = sigev->sigev_value;
2407 }
2408
2409 mutex_enter(&aiop->aio_mutex);
2410
2411 if (aiop->aio_flags & AIO_REQ_BLOCK) {
2412 mutex_exit(&aiop->aio_mutex);
2413 if (sqp)
2414 kmem_free(sqp, sizeof (sigqueue_t));
2415 return (EIO);
2416 }
2417 /*
2418 * get an aio_reqp from the free list or allocate one
2419 * from dynamic memory.
2420 */
2421 if (error = aio_req_alloc(&reqp, resultp)) {
2422 mutex_exit(&aiop->aio_mutex);
2423 if (sqp)
2424 kmem_free(sqp, sizeof (sigqueue_t));
2425 return (error);
2426 }
2427 aiop->aio_pending++;
2428 aiop->aio_outstanding++;
2429 reqp->aio_req_flags = AIO_PENDING;
2430 if (old_solaris_req) {
2431 /* this is an old solaris aio request */
2432 reqp->aio_req_flags |= AIO_SOLARIS;
2433 aiop->aio_flags |= AIO_SOLARIS_REQ;
2434 }
2435 if (sigev->sigev_notify == SIGEV_THREAD ||
2436 sigev->sigev_notify == SIGEV_PORT)
2437 aio_enq(&aiop->aio_portpending, reqp, 0);
2438 mutex_exit(&aiop->aio_mutex);
2439 /*
2440 * initialize aio request.
2441 */
2442 reqp->aio_req_fd = arg->aio_fildes;
2443 reqp->aio_req_sigqp = sqp;
2444 reqp->aio_req_iocb.iocb = NULL;
2445 reqp->aio_req_lio = NULL;
2446 reqp->aio_req_buf.b_file = vp;
2447 uio = reqp->aio_req.aio_uio;
2448 uio->uio_iovcnt = 1;
2449 uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
2450 uio->uio_iov->iov_len = arg->aio_nbytes;
2451 uio->uio_loffset = arg->aio_offset;
2452 *reqpp = reqp;
2453 return (0);
2454 }
2455
2456 /*
2457 * Allocate p_aio struct.
2458 */
2459 static aio_t *
2460 aio_aiop_alloc(void)
2461 {
2462 aio_t *aiop;
2463
2464 ASSERT(MUTEX_HELD(&curproc->p_lock));
2465
2466 aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
2467 if (aiop) {
2468 mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
2469 mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT,
2470 NULL);
2471 mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
2472 }
2473 return (aiop);
2474 }
2475
2476 /*
2477 * Allocate an aio_req struct.
2478 */
2479 static int
2480 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
2481 {
2482 aio_req_t *reqp;
2483 aio_t *aiop = curproc->p_aio;
2484
2485 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2486
2487 if ((reqp = aiop->aio_free) != NULL) {
2488 aiop->aio_free = reqp->aio_req_next;
2489 bzero(reqp, sizeof (*reqp));
2490 } else {
2491 /*
2492 * Check whether memory is getting tight.
2493 * This is a temporary mechanism to avoid memory
2494 * exhaustion by a single process until we come up
2495 * with a per process solution such as setrlimit().
2496 */
2497 if (freemem < desfree)
2498 return (EAGAIN);
2499 reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
2500 if (reqp == NULL)
2501 return (EAGAIN);
2502 }
2503 reqp->aio_req.aio_uio = &reqp->aio_req_uio;
2504 reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov;
2505 reqp->aio_req.aio_private = reqp;
2506 reqp->aio_req_buf.b_offset = -1;
2507 reqp->aio_req_resultp = resultp;
2508 if (aio_hash_insert(reqp, aiop)) {
2509 reqp->aio_req_next = aiop->aio_free;
2510 aiop->aio_free = reqp;
2511 return (EBUSY);
2512 }
2513 *nreqp = reqp;
2514 return (0);
2515 }
2516
2517 /*
2518 * Allocate an aio_lio_t struct.
2519 */
2520 static int
2521 aio_lio_alloc(aio_lio_t **head)
2522 {
2523 aio_lio_t *liop;
2524 aio_t *aiop = curproc->p_aio;
2525
2526 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2527
2528 if ((liop = aiop->aio_lio_free) != NULL) {
2529 aiop->aio_lio_free = liop->lio_next;
2530 } else {
2531 /*
2532 * Check whether memory is getting tight.
2533 * This is a temporary mechanism to avoid memory
2534 * exhaustion by a single process until we come up
2535 * with a per process solution such as setrlimit().
2536 */
2537 if (freemem < desfree)
2538 return (EAGAIN);
2539
2540 liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
2541 if (liop == NULL)
2542 return (EAGAIN);
2543 }
2544 *head = liop;
2545 return (0);
2546 }
2547
2548 /*
2549 * This is a special per-process thread that is only activated when
2550 * the process is unmapping a segment with outstanding aio.  Normally,
2551 * the process will have completed the aio before unmapping the
2552 * segment.  If the process does unmap a segment with outstanding aio,
2553 * this special thread guarantees that the pages locked by
2554 * aphysio() are released, thereby permitting the segment to be
2555 * unmapped.  In addition, the cleanup thread is woken up
2556 * during DR operations to release the locked pages.
2557 */
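/*
 * Added commentary: a rough, illustrative pseudocode summary of the
 * loop implemented below (not original code, details simplified).
 *
 *	for (;;) {
 *		if cleanup is not already in progress:
 *			latch a DR cleanup request (aio_rqclnup) and
 *			move the done queue onto the cleanup queue
 *		call aio_cleanup(AIO_CLEANUP_THREAD) to unlock pages
 *			and requeue requests to the done queue
 *		if any work queue is still non-empty: continue
 *		decide whether AIO_CLEANUP can be cleared; sleep on
 *			as->a_cv or aio_cleanupcv accordingly
 *		on wakeup, handle process exit/kill, fork/watchpoint
 *			holds, as_unmap() waiters, or a new DR cleanup request
 *	}
 */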
2558
2559 static int
2560 aio_cleanup_thread(aio_t *aiop)
2561 {
2562 proc_t *p = curproc;
2563 struct as *as = p->p_as;
2564 int poked = 0;
2565 kcondvar_t *cvp;
2566 int exit_flag = 0;
2567 int rqclnup = 0;
2568
2569 sigfillset(&curthread->t_hold);
2570 sigdiffset(&curthread->t_hold, &cantmask);
2571 for (;;) {
2572 /*
2573 * if a segment is being unmapped, and the current
2574 * process's done queue is not empty, then every request
2575 * on the doneq with locked resources should be forced
2576 * to release their locks. By moving the doneq request
2577 * to the cleanupq, aio_cleanup() will process the cleanupq,
2578 * and place requests back onto the doneq. All requests
2579 * processed by aio_cleanup() will have their physical
2580 * resources unlocked.
2581 */
2582 mutex_enter(&aiop->aio_mutex);
2583 if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
2584 aiop->aio_flags |= AIO_CLEANUP;
2585 mutex_enter(&as->a_contents);
2586 if (aiop->aio_rqclnup) {
2587 aiop->aio_rqclnup = 0;
2588 rqclnup = 1;
2589 }
2590 mutex_exit(&as->a_contents);
2591 if (aiop->aio_doneq) {
2592 aio_req_t *doneqhead = aiop->aio_doneq;
2593 aiop->aio_doneq = NULL;
2594 aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
2595 }
2596 }
2597 mutex_exit(&aiop->aio_mutex);
2598 aio_cleanup(AIO_CLEANUP_THREAD);
2599 /*
2600 * thread should block on the cleanupcv while
2601 * AIO_CLEANUP is set.
2602 */
2603 cvp = &aiop->aio_cleanupcv;
2604 mutex_enter(&aiop->aio_mutex);
2605
2606 if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
2607 aiop->aio_notifyq != NULL ||
2608 aiop->aio_portcleanupq != NULL) {
2609 mutex_exit(&aiop->aio_mutex);
2610 continue;
2611 }
2612 mutex_enter(&as->a_contents);
2613
2614 /*
2615 * AIO_CLEANUP determines when the cleanup thread
2616 * should be active. This flag is set when
2617 * the cleanup thread is awakened by as_unmap() or
2618 * due to DR operations.
2619 * The flag is cleared when the blocking as_unmap()
2620 * that originally awakened us is allowed to
2621 * complete. as_unmap() blocks when trying to
2622 * unmap a segment that has SOFTLOCKed pages. When
2623 * the segment's pages are all SOFTUNLOCKed,
2624 * as->a_flags & AS_UNMAPWAIT should be zero.
2625 *
2626 * In case of cleanup request by DR, the flag is cleared
2627 * once all the pending aio requests have been processed.
2628 *
2629 * The flag shouldn't be cleared right away if the
2630 * cleanup thread was interrupted because the process
2631 * is doing forkall(). This happens when cv_wait_sig()
2632 * returns zero, because it was awakened by a pokelwps().
2633 * If the process is not exiting, it must be doing forkall().
2634 */
2635 if ((poked == 0) &&
2636 ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
2637 (aiop->aio_pending == 0))) {
2638 aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
2639 cvp = &as->a_cv;
2640 rqclnup = 0;
2641 }
2642 mutex_exit(&aiop->aio_mutex);
2643 if (poked) {
2644 /*
2645 * If the process is exiting/killed, don't return
2646 * immediately without waiting for pending I/O's
2647 * and releasing the page locks.
2648 */
2649 if (p->p_flag & (SEXITLWPS|SKILLED)) {
2650 /*
2651 * If exit_flag is set, then it is
2652 * safe to exit because we have released
2653 * page locks of completed I/O's.
2654 */
2655 if (exit_flag)
2656 break;
2657
2658 mutex_exit(&as->a_contents);
2659
2660 /*
2661 * Wait for all the pending aio to complete.
2662 */
2663 mutex_enter(&aiop->aio_mutex);
2664 aiop->aio_flags |= AIO_REQ_BLOCK;
2665 while (aiop->aio_pending != 0)
2666 cv_wait(&aiop->aio_cleanupcv,
2667 &aiop->aio_mutex);
2668 mutex_exit(&aiop->aio_mutex);
2669 exit_flag = 1;
2670 continue;
2671 } else if (p->p_flag &
2672 (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
2673 /*
2674 * hold LWP until it
2675 * is continued.
2676 */
2677 mutex_exit(&as->a_contents);
2678 mutex_enter(&p->p_lock);
2679 stop(PR_SUSPENDED, SUSPEND_NORMAL);
2680 mutex_exit(&p->p_lock);
2681 poked = 0;
2682 continue;
2683 }
2684 } else {
2685 /*
2686 * When started this thread will sleep on as->a_cv.
2687 * as_unmap will awake this thread if the
2688 * segment has SOFTLOCKed pages (poked = 0).
2689 * 1. pokelwps() awakes this thread =>
2690 * break the loop to check SEXITLWPS, SHOLDFORK, etc
2691 * 2. as_unmap awakes this thread =>
2692 * to break the loop it is necessary that
2693 * - AS_UNMAPWAIT is set (as_unmap is waiting for
2694 * memory to be unlocked)
2695 * - AIO_CLEANUP is not set
2696 * (if AIO_CLEANUP is set we have to wait for
2697 * pending requests. aio_done will send a signal
2698 * for every request which completes to continue
2699 * unmapping the corresponding address range)
2700 * 3. A cleanup request will wake this thread up, ex.
2701 * by the DR operations. The aio_rqclnup flag will
2702 * be set.
2703 */
2704 while (poked == 0) {
2705 /*
2706 * Cleanup requests that came in
2707 * after we had just cleaned up cannot
2708 * be what is blocking the unmap thread,
2709 * since the unmap event happened first.
2710 * Let aio_done() wake us up if it sees a need.
2711 */
2712 if (aiop->aio_rqclnup &&
2713 (aiop->aio_flags & AIO_CLEANUP) == 0)
2714 break;
2715 poked = !cv_wait_sig(cvp, &as->a_contents);
2716 if (AS_ISUNMAPWAIT(as) == 0)
2717 cv_signal(cvp);
2718 if (aiop->aio_outstanding != 0)
2719 break;
2720 }
2721 }
2722 mutex_exit(&as->a_contents);
2723 }
2724 exit:
2725 mutex_exit(&as->a_contents);
2726 ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
2727 aston(curthread); /* make thread do post_syscall */
2728 return (0);
2729 }
2730
2731 /*
2732 * save a reference to a user's outstanding aio in a hash list.
2733 */
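/*
 * Added commentary: the hash is keyed on the user-level aio_result_t
 * pointer, so lookups walk the same chain that aio_hash_insert() below
 * builds.  An illustrative lookup (see aio_req_done()/aio_req_find()):
 *
 *	long index = AIO_HASH(resultp);
 *	aio_req_t *ent;
 *
 *	for (ent = aiop->aio_hash[index]; ent != NULL;
 *	    ent = ent->aio_hash_next) {
 *		if (ent->aio_req_resultp == resultp)
 *			break;
 *	}
 *
 * aio_hash_insert() returns DUPLICATE when an entry with the same
 * resultp already exists; aio_req_alloc() maps that to EBUSY.
 */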
2734 static int
2735 aio_hash_insert(
2736 aio_req_t *aio_reqp,
2737 aio_t *aiop)
2738 {
2739 long index;
2740 aio_result_t *resultp = aio_reqp->aio_req_resultp;
2741 aio_req_t *current;
2742 aio_req_t **nextp;
2743
2744 index = AIO_HASH(resultp);
2745 nextp = &aiop->aio_hash[index];
2746 while ((current = *nextp) != NULL) {
2747 if (current->aio_req_resultp == resultp)
2748 return (DUPLICATE);
2749 nextp = &current->aio_hash_next;
2750 }
2751 *nextp = aio_reqp;
2752 aio_reqp->aio_hash_next = NULL;
2753 return (0);
2754 }
2755
2756 static int
2757 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
2758 cred_t *)
2759 {
2760 struct snode *sp;
2761 dev_t dev;
2762 struct cb_ops *cb;
2763 major_t major;
2764 int (*aio_func)();
2765
2766 dev = vp->v_rdev;
2767 major = getmajor(dev);
2768
2769 /*
2770 * return NULL for requests to files and STREAMS devices so
2771 * that libaio takes care of them.
2772 */
2773 if (vp->v_type == VCHR) {
2774 /* no stream device for kaio */
2775 if (STREAMSTAB(major)) {
2776 return (NULL);
2777 }
2778 } else {
2779 return (NULL);
2780 }
2781
2782 /*
2783 * Check old drivers which do not have async I/O entry points.
2784 */
2785 if (devopsp[major]->devo_rev < 3)
2786 return (NULL);
2787
2788 cb = devopsp[major]->devo_cb_ops;
2789
2790 if (cb->cb_rev < 1)
2791 return (NULL);
2792
2793 /*
2794 * Check whether the driver provides a strategy routine (block-style
2795 * transfers). Kaio is not supported for devices like ttys.
2796 */
2797 if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
2798 return (NULL);
2799
2800 /*
2801 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
2802 * We cannot call the driver directly. Instead return the
2803 * PXFS functions.
2804 */
2805
2806 if (IS_PXFSVP(vp)) {
2807 if (mode & FREAD)
2808 return (clpxfs_aio_read);
2809 else
2810 return (clpxfs_aio_write);
2811 }
2812 if (mode & FREAD)
2813 aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
2814 else
2815 aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;
2816
2817 /*
2818 * Do we need this ?
2819 * nodev returns ENXIO anyway.
2820 */
2821 if (aio_func == nodev)
2822 return (NULL);
2823
2824 sp = VTOS(vp);
2825 smark(sp, SACC);
2826 return (aio_func);
2827 }
2828
2829 /*
2830 * Clustering: We want check_vp to return a function prototyped
2831 * correctly that will be common to both PXFS and regular case.
2832 * We define this intermediate function that will do the right
2833 * thing for driver cases.
2834 */
2835
2836 static int
2837 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2838 {
2839 dev_t dev;
2840 struct cb_ops *cb;
2841
2842 ASSERT(vp->v_type == VCHR);
2843 ASSERT(!IS_PXFSVP(vp));
2844 dev = VTOS(vp)->s_dev;
2845 ASSERT(STREAMSTAB(getmajor(dev)) == NULL);
2846
2847 cb = devopsp[getmajor(dev)]->devo_cb_ops;
2848
2849 ASSERT(cb->cb_awrite != nodev);
2850 return ((*cb->cb_awrite)(dev, aio, cred_p));
2851 }
2852
2853 /*
2854 * Clustering: We want check_vp to return a function prototyped
2855 * correctly that will be common to both PXFS and regular case.
2856 * We define this intermediate function that will do the right
2857 * thing for driver cases.
2858 */
2859
2860 static int
2861 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2862 {
2863 dev_t dev;
2864 struct cb_ops *cb;
2865
2866 ASSERT(vp->v_type == VCHR);
2867 ASSERT(!IS_PXFSVP(vp));
2868 dev = VTOS(vp)->s_dev;
2869 ASSERT(!STREAMSTAB(getmajor(dev)));
2870
2871 cb = devopsp[getmajor(dev)]->devo_cb_ops;
2872
2873 ASSERT(cb->cb_aread != nodev);
2874 return ((*cb->cb_aread)(dev, aio, cred_p));
2875 }
2876
2877 /*
2878 * This routine is called when a largefile call is made by a 32-bit
2879 * process on an ILP32 or LP64 kernel. All 64-bit processes are large
2880 * file by definition and will call alio() instead.
2881 */
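/*
 * Added commentary: a hypothetical 32-bit, largefile-aware caller
 * reaches this routine through lio_listio() when compiled with
 * _FILE_OFFSET_BITS=64.  The descriptor fd, buffers rbuf/wbuf, and
 * offsets below are illustrative only; assume <aio.h> and <string.h>.
 *
 *	struct aiocb rd, wr;
 *	struct aiocb *list[2];
 *
 *	(void) memset(&rd, 0, sizeof (rd));
 *	rd.aio_fildes = fd;
 *	rd.aio_buf = rbuf;
 *	rd.aio_nbytes = sizeof (rbuf);
 *	rd.aio_offset = 0;
 *	rd.aio_lio_opcode = LIO_READ;
 *	wr = rd;
 *	wr.aio_buf = wbuf;
 *	wr.aio_nbytes = sizeof (wbuf);
 *	wr.aio_offset = 8192;
 *	wr.aio_lio_opcode = LIO_WRITE;
 *	list[0] = &rd;
 *	list[1] = &wr;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) != 0)
 *		perror("lio_listio");
 */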
2882 static int
2883 alioLF(
2884 int mode_arg,
2885 void *aiocb_arg,
2886 int nent,
2887 void *sigev)
2888 {
2889 file_t *fp;
2890 file_t *prev_fp = NULL;
2891 int prev_mode = -1;
2892 struct vnode *vp;
2893 aio_lio_t *head;
2894 aio_req_t *reqp;
2895 aio_t *aiop;
2896 caddr_t cbplist;
2897 aiocb64_32_t cb64;
2898 aiocb64_32_t *aiocb = &cb64;
2899 aiocb64_32_t *cbp;
2900 caddr32_t *ucbp;
2901 #ifdef _LP64
2902 aiocb_t aiocb_n;
2903 #endif
2904 struct sigevent32 sigevk;
2905 sigqueue_t *sqp;
2906 int (*aio_func)();
2907 int mode;
2908 int error = 0;
2909 int aio_errors = 0;
2910 int i;
2911 size_t ssize;
2912 int deadhead = 0;
2913 int aio_notsupported = 0;
2914 int lio_head_port;
2915 int aio_port;
2916 int aio_thread;
2917 port_kevent_t *pkevtp = NULL;
2918 int portused = 0;
2919 port_notify32_t pnotify;
2920 int event;
2921
2922 aiop = curproc->p_aio;
2923 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
2924 return (EINVAL);
2925
2926 ASSERT(get_udatamodel() == DATAMODEL_ILP32);
2927
2928 ssize = (sizeof (caddr32_t) * nent);
2929 cbplist = kmem_alloc(ssize, KM_SLEEP);
2930 ucbp = (caddr32_t *)cbplist;
2931
2932 if (copyin(aiocb_arg, cbplist, ssize) ||
2933 (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
2934 kmem_free(cbplist, ssize);
2935 return (EFAULT);
2936 }
2937
2938 /* Event Ports */
2939 if (sigev &&
2940 (sigevk.sigev_notify == SIGEV_THREAD ||
2941 sigevk.sigev_notify == SIGEV_PORT)) {
2942 if (sigevk.sigev_notify == SIGEV_THREAD) {
2943 pnotify.portnfy_port = sigevk.sigev_signo;
2944 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
2945 } else if (copyin(
2946 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
2947 &pnotify, sizeof (pnotify))) {
2948 kmem_free(cbplist, ssize);
2949 return (EFAULT);
2950 }
2951 error = port_alloc_event(pnotify.portnfy_port,
2952 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
2953 if (error) {
2954 if (error == ENOMEM || error == EAGAIN)
2955 error = EAGAIN;
2956 else
2957 error = EINVAL;
2958 kmem_free(cbplist, ssize);
2959 return (error);
2960 }
2961 lio_head_port = pnotify.portnfy_port;
2962 portused = 1;
2963 }
2964
2965 /*
2966 * a list head should be allocated if notification is
2967 * enabled for this list.
2968 */
2969 head = NULL;
2970
2971 if (mode_arg == LIO_WAIT || sigev) {
2972 mutex_enter(&aiop->aio_mutex);
2973 error = aio_lio_alloc(&head);
2974 mutex_exit(&aiop->aio_mutex);
2975 if (error)
2976 goto done;
2977 deadhead = 1;
2978 head->lio_nent = nent;
2979 head->lio_refcnt = nent;
2980 head->lio_port = -1;
2981 head->lio_portkev = NULL;
2982 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
2983 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
2984 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2985 if (sqp == NULL) {
2986 error = EAGAIN;
2987 goto done;
2988 }
2989 sqp->sq_func = NULL;
2990 sqp->sq_next = NULL;
2991 sqp->sq_info.si_code = SI_ASYNCIO;
2992 sqp->sq_info.si_pid = curproc->p_pid;
2993 sqp->sq_info.si_ctid = PRCTID(curproc);
2994 sqp->sq_info.si_zoneid = getzoneid();
2995 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2996 sqp->sq_info.si_signo = sigevk.sigev_signo;
2997 sqp->sq_info.si_value.sival_int =
2998 sigevk.sigev_value.sival_int;
2999 head->lio_sigqp = sqp;
3000 } else {
3001 head->lio_sigqp = NULL;
3002 }
3003 if (pkevtp) {
3004 /*
3005 * Prepare data to send when list of aiocb's
3006 * has completed.
3007 */
3008 port_init_event(pkevtp, (uintptr_t)sigev,
3009 (void *)(uintptr_t)pnotify.portnfy_user,
3010 NULL, head);
3011 pkevtp->portkev_events = AIOLIO64;
3012 head->lio_portkev = pkevtp;
3013 head->lio_port = pnotify.portnfy_port;
3014 }
3015 }
3016
3017 for (i = 0; i < nent; i++, ucbp++) {
3018
3019 cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
3020 /* skip entry if it can't be copied. */
3021 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
3022 if (head) {
3023 mutex_enter(&aiop->aio_mutex);
3024 head->lio_nent--;
3025 head->lio_refcnt--;
3026 mutex_exit(&aiop->aio_mutex);
3027 }
3028 continue;
3029 }
3030
3031 /* skip if opcode for aiocb is LIO_NOP */
3032 mode = aiocb->aio_lio_opcode;
3033 if (mode == LIO_NOP) {
3034 cbp = NULL;
3035 if (head) {
3036 mutex_enter(&aiop->aio_mutex);
3037 head->lio_nent--;
3038 head->lio_refcnt--;
3039 mutex_exit(&aiop->aio_mutex);
3040 }
3041 continue;
3042 }
3043
3044 /* increment file descriptor's ref count. */
3045 if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3046 lio_set_uerror(&cbp->aio_resultp, EBADF);
3047 if (head) {
3048 mutex_enter(&aiop->aio_mutex);
3049 head->lio_nent--;
3050 head->lio_refcnt--;
3051 mutex_exit(&aiop->aio_mutex);
3052 }
3053 aio_errors++;
3054 continue;
3055 }
3056
3057 /*
3058 * verify that the file was opened for the requested access (read/write)
3059 */
3060 if ((fp->f_flag & mode) == 0) {
3061 releasef(aiocb->aio_fildes);
3062 lio_set_uerror(&cbp->aio_resultp, EBADF);
3063 if (head) {
3064 mutex_enter(&aiop->aio_mutex);
3065 head->lio_nent--;
3066 head->lio_refcnt--;
3067 mutex_exit(&aiop->aio_mutex);
3068 }
3069 aio_errors++;
3070 continue;
3071 }
3072
3073 /*
3074 * common case where requests are to the same fd
3075 * for the same r/w operation
3076 * for UFS, need to set EBADFD
3077 */
3078 vp = fp->f_vnode;
3079 if (fp != prev_fp || mode != prev_mode) {
3080 aio_func = check_vp(vp, mode);
3081 if (aio_func == NULL) {
3082 prev_fp = NULL;
3083 releasef(aiocb->aio_fildes);
3084 lio_set_uerror(&cbp->aio_resultp, EBADFD);
3085 aio_notsupported++;
3086 if (head) {
3087 mutex_enter(&aiop->aio_mutex);
3088 head->lio_nent--;
3089 head->lio_refcnt--;
3090 mutex_exit(&aiop->aio_mutex);
3091 }
3092 continue;
3093 } else {
3094 prev_fp = fp;
3095 prev_mode = mode;
3096 }
3097 }
3098
3099 #ifdef _LP64
3100 aiocb_LFton(aiocb, &aiocb_n);
3101 error = aio_req_setup(&reqp, aiop, &aiocb_n,
3102 (aio_result_t *)&cbp->aio_resultp, vp, 0);
3103 #else
3104 error = aio_req_setupLF(&reqp, aiop, aiocb,
3105 (aio_result_t *)&cbp->aio_resultp, vp, 0);
3106 #endif /* _LP64 */
3107 if (error) {
3108 releasef(aiocb->aio_fildes);
3109 lio_set_uerror(&cbp->aio_resultp, error);
3110 if (head) {
3111 mutex_enter(&aiop->aio_mutex);
3112 head->lio_nent--;
3113 head->lio_refcnt--;
3114 mutex_exit(&aiop->aio_mutex);
3115 }
3116 aio_errors++;
3117 continue;
3118 }
3119
3120 reqp->aio_req_lio = head;
3121 deadhead = 0;
3122
3123 /*
3124 * Set the errno field now before sending the request to
3125 * the driver to avoid a race condition
3126 */
3127 (void) suword32(&cbp->aio_resultp.aio_errno,
3128 EINPROGRESS);
3129
3130 reqp->aio_req_iocb.iocb32 = *ucbp;
3131
3132 event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
3133 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3134 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3135 if (aio_port | aio_thread) {
3136 port_kevent_t *lpkevp;
3137 /*
3138 * Prepare data to send with each aiocb completed.
3139 */
3140 if (aio_port) {
3141 void *paddr = (void *)(uintptr_t)
3142 aiocb->aio_sigevent.sigev_value.sival_ptr;
3143 if (copyin(paddr, &pnotify, sizeof (pnotify)))
3144 error = EFAULT;
3145 } else { /* aio_thread */
3146 pnotify.portnfy_port =
3147 aiocb->aio_sigevent.sigev_signo;
3148 pnotify.portnfy_user =
3149 aiocb->aio_sigevent.sigev_value.sival_ptr;
3150 }
3151 if (error)
3152 /* EMPTY */;
3153 else if (pkevtp != NULL &&
3154 pnotify.portnfy_port == lio_head_port)
3155 error = port_dup_event(pkevtp, &lpkevp,
3156 PORT_ALLOC_DEFAULT);
3157 else
3158 error = port_alloc_event(pnotify.portnfy_port,
3159 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3160 &lpkevp);
3161 if (error == 0) {
3162 port_init_event(lpkevp, (uintptr_t)*ucbp,
3163 (void *)(uintptr_t)pnotify.portnfy_user,
3164 aio_port_callback, reqp);
3165 lpkevp->portkev_events = event;
3166 reqp->aio_req_portkev = lpkevp;
3167 reqp->aio_req_port = pnotify.portnfy_port;
3168 }
3169 }
3170
3171 /*
3172 * send the request to driver.
3173 */
3174 if (error == 0) {
3175 if (aiocb->aio_nbytes == 0) {
3176 clear_active_fd(aiocb->aio_fildes);
3177 aio_zerolen(reqp);
3178 continue;
3179 }
3180 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3181 CRED());
3182 }
3183
3184 /*
3185 * the fd's ref count is not decremented until the IO has
3186 * completed unless there was an error.
3187 */
3188 if (error) {
3189 releasef(aiocb->aio_fildes);
3190 lio_set_uerror(&cbp->aio_resultp, error);
3191 if (head) {
3192 mutex_enter(&aiop->aio_mutex);
3193 head->lio_nent--;
3194 head->lio_refcnt--;
3195 mutex_exit(&aiop->aio_mutex);
3196 }
3197 if (error == ENOTSUP)
3198 aio_notsupported++;
3199 else
3200 aio_errors++;
3201 lio_set_error(reqp, portused);
3202 } else {
3203 clear_active_fd(aiocb->aio_fildes);
3204 }
3205 }
3206
3207 if (aio_notsupported) {
3208 error = ENOTSUP;
3209 } else if (aio_errors) {
3210 /*
3211 * return EIO if any request failed
3212 */
3213 error = EIO;
3214 }
3215
3216 if (mode_arg == LIO_WAIT) {
3217 mutex_enter(&aiop->aio_mutex);
3218 while (head->lio_refcnt > 0) {
3219 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3220 mutex_exit(&aiop->aio_mutex);
3221 error = EINTR;
3222 goto done;
3223 }
3224 }
3225 mutex_exit(&aiop->aio_mutex);
3226 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
3227 }
3228
3229 done:
3230 kmem_free(cbplist, ssize);
3231 if (deadhead) {
3232 if (head->lio_sigqp)
3233 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3234 if (head->lio_portkev)
3235 port_free_event(head->lio_portkev);
3236 kmem_free(head, sizeof (aio_lio_t));
3237 }
3238 return (error);
3239 }
3240
3241 #ifdef _SYSCALL32_IMPL
3242 static void
3243 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
3244 {
3245 dest->aio_fildes = src->aio_fildes;
3246 dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
3247 dest->aio_nbytes = (size_t)src->aio_nbytes;
3248 dest->aio_offset = (off_t)src->aio_offset;
3249 dest->aio_reqprio = src->aio_reqprio;
3250 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3251 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3252
3253 /*
3254 * See comment in sigqueue32() on handling of 32-bit
3255 * sigvals in a 64-bit kernel.
3256 */
3257 dest->aio_sigevent.sigev_value.sival_int =
3258 (int)src->aio_sigevent.sigev_value.sival_int;
3259 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3260 (uintptr_t)src->aio_sigevent.sigev_notify_function;
3261 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3262 (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3263 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3264 dest->aio_lio_opcode = src->aio_lio_opcode;
3265 dest->aio_state = src->aio_state;
3266 dest->aio__pad[0] = src->aio__pad[0];
3267 }
3268 #endif
3269
3270 /*
3271 * This function is used only for largefile calls made by
3272 * 32 bit applications.
3273 */
3274 static int
3275 aio_req_setupLF(
3276 aio_req_t **reqpp,
3277 aio_t *aiop,
3278 aiocb64_32_t *arg,
3279 aio_result_t *resultp,
3280 vnode_t *vp,
3281 int old_solaris_req)
3282 {
3283 sigqueue_t *sqp = NULL;
3284 aio_req_t *reqp;
3285 struct uio *uio;
3286 struct sigevent32 *sigev;
3287 int error;
3288
3289 sigev = &arg->aio_sigevent;
3290 if (sigev->sigev_notify == SIGEV_SIGNAL &&
3291 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
3292 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3293 if (sqp == NULL)
3294 return (EAGAIN);
3295 sqp->sq_func = NULL;
3296 sqp->sq_next = NULL;
3297 sqp->sq_info.si_code = SI_ASYNCIO;
3298 sqp->sq_info.si_pid = curproc->p_pid;
3299 sqp->sq_info.si_ctid = PRCTID(curproc);
3300 sqp->sq_info.si_zoneid = getzoneid();
3301 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3302 sqp->sq_info.si_signo = sigev->sigev_signo;
3303 sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
3304 }
3305
3306 mutex_enter(&aiop->aio_mutex);
3307
3308 if (aiop->aio_flags & AIO_REQ_BLOCK) {
3309 mutex_exit(&aiop->aio_mutex);
3310 if (sqp)
3311 kmem_free(sqp, sizeof (sigqueue_t));
3312 return (EIO);
3313 }
3314 /*
3315 * get an aio_reqp from the free list or allocate one
3316 * from dynamic memory.
3317 */
3318 if (error = aio_req_alloc(&reqp, resultp)) {
3319 mutex_exit(&aiop->aio_mutex);
3320 if (sqp)
3321 kmem_free(sqp, sizeof (sigqueue_t));
3322 return (error);
3323 }
3324 aiop->aio_pending++;
3325 aiop->aio_outstanding++;
3326 reqp->aio_req_flags = AIO_PENDING;
3327 if (old_solaris_req) {
3328 /* this is an old solaris aio request */
3329 reqp->aio_req_flags |= AIO_SOLARIS;
3330 aiop->aio_flags |= AIO_SOLARIS_REQ;
3331 }
3332 if (sigev->sigev_notify == SIGEV_THREAD ||
3333 sigev->sigev_notify == SIGEV_PORT)
3334 aio_enq(&aiop->aio_portpending, reqp, 0);
3335 mutex_exit(&aiop->aio_mutex);
3336 /*
3337 * initialize aio request.
3338 */
3339 reqp->aio_req_fd = arg->aio_fildes;
3340 reqp->aio_req_sigqp = sqp;
3341 reqp->aio_req_iocb.iocb = NULL;
3342 reqp->aio_req_lio = NULL;
3343 reqp->aio_req_buf.b_file = vp;
3344 uio = reqp->aio_req.aio_uio;
3345 uio->uio_iovcnt = 1;
3346 uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
3347 uio->uio_iov->iov_len = arg->aio_nbytes;
3348 uio->uio_loffset = arg->aio_offset;
3349 *reqpp = reqp;
3350 return (0);
3351 }
3352
3353 /*
3354 * This routine is called when a non-largefile call is made by a 32-bit
3355 * process on an ILP32 or LP64 kernel.
3356 */
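/*
 * Added commentary: an illustrative 32-bit caller of lio_listio() using
 * LIO_NOWAIT with list-completion notification by signal; such a call
 * arrives here.  The list, nent, and signal choice are hypothetical.
 *
 *	struct sigevent sev;
 *
 *	(void) memset(&sev, 0, sizeof (sev));
 *	sev.sigev_notify = SIGEV_SIGNAL;
 *	sev.sigev_signo = SIGUSR1;
 *	sev.sigev_value.sival_ptr = list;
 *	if (lio_listio(LIO_NOWAIT, list, nent, &sev) != 0)
 *		perror("lio_listio");
 */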
3357 static int
3358 alio32(
3359 int mode_arg,
3360 void *aiocb_arg,
3361 int nent,
3362 void *sigev)
3363 {
3364 file_t *fp;
3365 file_t *prev_fp = NULL;
3366 int prev_mode = -1;
3367 struct vnode *vp;
3368 aio_lio_t *head;
3369 aio_req_t *reqp;
3370 aio_t *aiop;
3371 caddr_t cbplist;
3372 aiocb_t cb;
3373 aiocb_t *aiocb = &cb;
3374 #ifdef _LP64
3375 aiocb32_t *cbp;
3376 caddr32_t *ucbp;
3377 aiocb32_t cb32;
3378 aiocb32_t *aiocb32 = &cb32;
3379 struct sigevent32 sigevk;
3380 #else
3381 aiocb_t *cbp, **ucbp;
3382 struct sigevent sigevk;
3383 #endif
3384 sigqueue_t *sqp;
3385 int (*aio_func)();
3386 int mode;
3387 int error = 0;
3388 int aio_errors = 0;
3389 int i;
3390 size_t ssize;
3391 int deadhead = 0;
3392 int aio_notsupported = 0;
3393 int lio_head_port;
3394 int aio_port;
3395 int aio_thread;
3396 port_kevent_t *pkevtp = NULL;
3397 int portused = 0;
3398 #ifdef _LP64
3399 port_notify32_t pnotify;
3400 #else
3401 port_notify_t pnotify;
3402 #endif
3403 int event;
3404
3405 aiop = curproc->p_aio;
3406 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
3407 return (EINVAL);
3408
3409 #ifdef _LP64
3410 ssize = (sizeof (caddr32_t) * nent);
3411 #else
3412 ssize = (sizeof (aiocb_t *) * nent);
3413 #endif
3414 cbplist = kmem_alloc(ssize, KM_SLEEP);
3415 ucbp = (void *)cbplist;
3416
3417 if (copyin(aiocb_arg, cbplist, ssize) ||
3418 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) {
3419 kmem_free(cbplist, ssize);
3420 return (EFAULT);
3421 }
3422
3423 /* Event Ports */
3424 if (sigev &&
3425 (sigevk.sigev_notify == SIGEV_THREAD ||
3426 sigevk.sigev_notify == SIGEV_PORT)) {
3427 if (sigevk.sigev_notify == SIGEV_THREAD) {
3428 pnotify.portnfy_port = sigevk.sigev_signo;
3429 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
3430 } else if (copyin(
3431 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
3432 &pnotify, sizeof (pnotify))) {
3433 kmem_free(cbplist, ssize);
3434 return (EFAULT);
3435 }
3436 error = port_alloc_event(pnotify.portnfy_port,
3437 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
3438 if (error) {
3439 if (error == ENOMEM || error == EAGAIN)
3440 error = EAGAIN;
3441 else
3442 error = EINVAL;
3443 kmem_free(cbplist, ssize);
3444 return (error);
3445 }
3446 lio_head_port = pnotify.portnfy_port;
3447 portused = 1;
3448 }
3449
3450 /*
3451 * a list head should be allocated if notification is
3452 * enabled for this list.
3453 */
3454 head = NULL;
3455
3456 if (mode_arg == LIO_WAIT || sigev) {
3457 mutex_enter(&aiop->aio_mutex);
3458 error = aio_lio_alloc(&head);
3459 mutex_exit(&aiop->aio_mutex);
3460 if (error)
3461 goto done;
3462 deadhead = 1;
3463 head->lio_nent = nent;
3464 head->lio_refcnt = nent;
3465 head->lio_port = -1;
3466 head->lio_portkev = NULL;
3467 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
3468 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
3469 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3470 if (sqp == NULL) {
3471 error = EAGAIN;
3472 goto done;
3473 }
3474 sqp->sq_func = NULL;
3475 sqp->sq_next = NULL;
3476 sqp->sq_info.si_code = SI_ASYNCIO;
3477 sqp->sq_info.si_pid = curproc->p_pid;
3478 sqp->sq_info.si_ctid = PRCTID(curproc);
3479 sqp->sq_info.si_zoneid = getzoneid();
3480 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3481 sqp->sq_info.si_signo = sigevk.sigev_signo;
3482 sqp->sq_info.si_value.sival_int =
3483 sigevk.sigev_value.sival_int;
3484 head->lio_sigqp = sqp;
3485 } else {
3486 head->lio_sigqp = NULL;
3487 }
3488 if (pkevtp) {
3489 /*
3490 * Prepare data to send when list of aiocb's has
3491 * completed.
3492 */
3493 port_init_event(pkevtp, (uintptr_t)sigev,
3494 (void *)(uintptr_t)pnotify.portnfy_user,
3495 NULL, head);
3496 pkevtp->portkev_events = AIOLIO;
3497 head->lio_portkev = pkevtp;
3498 head->lio_port = pnotify.portnfy_port;
3499 }
3500 }
3501
3502 for (i = 0; i < nent; i++, ucbp++) {
3503
3504 /* skip entry if it can't be copied. */
3505 #ifdef _LP64
3506 cbp = (aiocb32_t *)(uintptr_t)*ucbp;
3507 if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32)))
3508 #else
3509 cbp = (aiocb_t *)*ucbp;
3510 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb)))
3511 #endif
3512 {
3513 if (head) {
3514 mutex_enter(&aiop->aio_mutex);
3515 head->lio_nent--;
3516 head->lio_refcnt--;
3517 mutex_exit(&aiop->aio_mutex);
3518 }
3519 continue;
3520 }
3521 #ifdef _LP64
3522 /*
3523 * copy 32 bit structure into 64 bit structure
3524 */
3525 aiocb_32ton(aiocb32, aiocb);
3526 #endif /* _LP64 */
3527
3528 /* skip if opcode for aiocb is LIO_NOP */
3529 mode = aiocb->aio_lio_opcode;
3530 if (mode == LIO_NOP) {
3531 cbp = NULL;
3532 if (head) {
3533 mutex_enter(&aiop->aio_mutex);
3534 head->lio_nent--;
3535 head->lio_refcnt--;
3536 mutex_exit(&aiop->aio_mutex);
3537 }
3538 continue;
3539 }
3540
3541 /* increment file descriptor's ref count. */
3542 if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3543 lio_set_uerror(&cbp->aio_resultp, EBADF);
3544 if (head) {
3545 mutex_enter(&aiop->aio_mutex);
3546 head->lio_nent--;
3547 head->lio_refcnt--;
3548 mutex_exit(&aiop->aio_mutex);
3549 }
3550 aio_errors++;
3551 continue;
3552 }
3553
3554 /*
3555 * verify that the file was opened for the requested access (read/write)
3556 */
3557 if ((fp->f_flag & mode) == 0) {
3558 releasef(aiocb->aio_fildes);
3559 lio_set_uerror(&cbp->aio_resultp, EBADF);
3560 if (head) {
3561 mutex_enter(&aiop->aio_mutex);
3562 head->lio_nent--;
3563 head->lio_refcnt--;
3564 mutex_exit(&aiop->aio_mutex);
3565 }
3566 aio_errors++;
3567 continue;
3568 }
3569
3570 /*
3571 * common case where requests are to the same fd
3572 * for the same r/w operation
3573 * for UFS, need to set EBADFD
3574 */
3575 vp = fp->f_vnode;
3576 if (fp != prev_fp || mode != prev_mode) {
3577 aio_func = check_vp(vp, mode);
3578 if (aio_func == NULL) {
3579 prev_fp = NULL;
3580 releasef(aiocb->aio_fildes);
3581 lio_set_uerror(&cbp->aio_resultp, EBADFD);
3582 aio_notsupported++;
3583 if (head) {
3584 mutex_enter(&aiop->aio_mutex);
3585 head->lio_nent--;
3586 head->lio_refcnt--;
3587 mutex_exit(&aiop->aio_mutex);
3588 }
3589 continue;
3590 } else {
3591 prev_fp = fp;
3592 prev_mode = mode;
3593 }
3594 }
3595
3596 error = aio_req_setup(&reqp, aiop, aiocb,
3597 (aio_result_t *)&cbp->aio_resultp, vp, 0);
3598 if (error) {
3599 releasef(aiocb->aio_fildes);
3600 lio_set_uerror(&cbp->aio_resultp, error);
3601 if (head) {
3602 mutex_enter(&aiop->aio_mutex);
3603 head->lio_nent--;
3604 head->lio_refcnt--;
3605 mutex_exit(&aiop->aio_mutex);
3606 }
3607 aio_errors++;
3608 continue;
3609 }
3610
3611 reqp->aio_req_lio = head;
3612 deadhead = 0;
3613
3614 /*
3615 * Set the errno field now before sending the request to
3616 * the driver to avoid a race condition
3617 */
3618 (void) suword32(&cbp->aio_resultp.aio_errno,
3619 EINPROGRESS);
3620
3621 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp;
3622
3623 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
3624 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3625 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3626 if (aio_port | aio_thread) {
3627 port_kevent_t *lpkevp;
3628 /*
3629 * Prepare data to send with each aiocb completed.
3630 */
3631 #ifdef _LP64
3632 if (aio_port) {
3633 void *paddr = (void *)(uintptr_t)
3634 aiocb32->aio_sigevent.sigev_value.sival_ptr;
3635 if (copyin(paddr, &pnotify, sizeof (pnotify)))
3636 error = EFAULT;
3637 } else { /* aio_thread */
3638 pnotify.portnfy_port =
3639 aiocb32->aio_sigevent.sigev_signo;
3640 pnotify.portnfy_user =
3641 aiocb32->aio_sigevent.sigev_value.sival_ptr;
3642 }
3643 #else
3644 if (aio_port) {
3645 void *paddr =
3646 aiocb->aio_sigevent.sigev_value.sival_ptr;
3647 if (copyin(paddr, &pnotify, sizeof (pnotify)))
3648 error = EFAULT;
3649 } else { /* aio_thread */
3650 pnotify.portnfy_port =
3651 aiocb->aio_sigevent.sigev_signo;
3652 pnotify.portnfy_user =
3653 aiocb->aio_sigevent.sigev_value.sival_ptr;
3654 }
3655 #endif
3656 if (error)
3657 /* EMPTY */;
3658 else if (pkevtp != NULL &&
3659 pnotify.portnfy_port == lio_head_port)
3660 error = port_dup_event(pkevtp, &lpkevp,
3661 PORT_ALLOC_DEFAULT);
3662 else
3663 error = port_alloc_event(pnotify.portnfy_port,
3664 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3665 &lpkevp);
3666 if (error == 0) {
3667 port_init_event(lpkevp, (uintptr_t)cbp,
3668 (void *)(uintptr_t)pnotify.portnfy_user,
3669 aio_port_callback, reqp);
3670 lpkevp->portkev_events = event;
3671 reqp->aio_req_portkev = lpkevp;
3672 reqp->aio_req_port = pnotify.portnfy_port;
3673 }
3674 }
3675
3676 /*
3677 * send the request to driver.
3678 */
3679 if (error == 0) {
3680 if (aiocb->aio_nbytes == 0) {
3681 clear_active_fd(aiocb->aio_fildes);
3682 aio_zerolen(reqp);
3683 continue;
3684 }
3685 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3686 CRED());
3687 }
3688
3689 /*
3690 * the fd's ref count is not decremented until the IO has
3691 * completed unless there was an error.
3692 */
3693 if (error) {
3694 releasef(aiocb->aio_fildes);
3695 lio_set_uerror(&cbp->aio_resultp, error);
3696 if (head) {
3697 mutex_enter(&aiop->aio_mutex);
3698 head->lio_nent--;
3699 head->lio_refcnt--;
3700 mutex_exit(&aiop->aio_mutex);
3701 }
3702 if (error == ENOTSUP)
3703 aio_notsupported++;
3704 else
3705 aio_errors++;
3706 lio_set_error(reqp, portused);
3707 } else {
3708 clear_active_fd(aiocb->aio_fildes);
3709 }
3710 }
3711
3712 if (aio_notsupported) {
3713 error = ENOTSUP;
3714 } else if (aio_errors) {
3715 /*
3716 * return EIO if any request failed
3717 */
3718 error = EIO;
3719 }
3720
3721 if (mode_arg == LIO_WAIT) {
3722 mutex_enter(&aiop->aio_mutex);
3723 while (head->lio_refcnt > 0) {
3724 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3725 mutex_exit(&aiop->aio_mutex);
3726 error = EINTR;
3727 goto done;
3728 }
3729 }
3730 mutex_exit(&aiop->aio_mutex);
3731 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
3732 }
3733
3734 done:
3735 kmem_free(cbplist, ssize);
3736 if (deadhead) {
3737 if (head->lio_sigqp)
3738 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3739 if (head->lio_portkev)
3740 port_free_event(head->lio_portkev);
3741 kmem_free(head, sizeof (aio_lio_t));
3742 }
3743 return (error);
3744 }
3745
3746
3747 #ifdef _SYSCALL32_IMPL
3748 void
3749 aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
3750 {
3751 dest->aio_fildes = src->aio_fildes;
3752 dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
3753 dest->aio_nbytes = (size_t)src->aio_nbytes;
3754 dest->aio_offset = (off_t)src->aio_offset;
3755 dest->aio_reqprio = src->aio_reqprio;
3756 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3757 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3758
3759 /*
3760 * See comment in sigqueue32() on handling of 32-bit
3761 * sigvals in a 64-bit kernel.
3762 */
3763 dest->aio_sigevent.sigev_value.sival_int =
3764 (int)src->aio_sigevent.sigev_value.sival_int;
3765 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3766 (uintptr_t)src->aio_sigevent.sigev_notify_function;
3767 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3768 (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3769 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3770 dest->aio_lio_opcode = src->aio_lio_opcode;
3771 dest->aio_state = src->aio_state;
3772 dest->aio__pad[0] = src->aio__pad[0];
3773 }
3774 #endif /* _SYSCALL32_IMPL */
3775
3776 /*
3777 * aio_port_callback() is called just before the event is retrieved from the
3778 * port. The task of this callback function is to finish the work of the
3779 * transaction for the application, which means:
3780 * - copy out transaction data to the application
3781 * (this thread is running in the right process context)
3782 * - keep track of the transaction (update counters).
3783 * - free allocated buffers
3784 * The aiocb pointer is the object element of the port_kevent_t structure.
3785 *
3786 * flag :
3787 * PORT_CALLBACK_DEFAULT : do copyout and free resources
3788 * PORT_CALLBACK_CLOSE : don't do copyout, free resources
3789 */
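/*
 * Added commentary: an illustrative user-level consumer that retrieves
 * AIO completions through an event port; this callback runs in that
 * consumer's process context just before port_get() returns the event.
 * The port descriptor and error handling are hypothetical; assume
 * <port.h> and <aio.h>.
 *
 *	port_event_t pe;
 *
 *	if (port_get(port, &pe, NULL) == 0 &&
 *	    pe.portev_source == PORT_SOURCE_AIO) {
 *		struct aiocb *cb = (struct aiocb *)pe.portev_object;
 *		(void) aio_return(cb);
 *	}
 */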
3790
3791 /*ARGSUSED*/
3792 int
3793 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
3794 {
3795 aio_t *aiop = curproc->p_aio;
3796 aio_req_t *reqp = arg;
3797 struct iovec *iov;
3798 struct buf *bp;
3799 void *resultp;
3800
3801 if (pid != curproc->p_pid) {
3802 /* wrong process; cannot deliver data here */
3803 return (EACCES);
3804 }
3805
3806 mutex_enter(&aiop->aio_portq_mutex);
3807 reqp->aio_req_portkev = NULL;
3808 aio_req_remove_portq(aiop, reqp); /* remove request from portq */
3809 mutex_exit(&aiop->aio_portq_mutex);
3810 aphysio_unlock(reqp); /* unlock used pages */
3811 mutex_enter(&aiop->aio_mutex);
3812 if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
3813 aio_req_free_port(aiop, reqp); /* back to free list */
3814 mutex_exit(&aiop->aio_mutex);
3815 return (0);
3816 }
3817
3818 iov = reqp->aio_req_uio.uio_iov;
3819 bp = &reqp->aio_req_buf;
3820 resultp = (void *)reqp->aio_req_resultp;
3821 if (flag == PORT_CALLBACK_DEFAULT)
3822 aio_copyout_result_port(iov, bp, resultp);
3823 aio_req_free_port(aiop, reqp); /* request struct back to free list */
3824 mutex_exit(&aiop->aio_mutex);
3825 return (0);
3826 }
3827