/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Kernel asynchronous I/O.
 * This is only for raw devices now (as of Nov. 1993).
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/fs/snode.h>
#include <sys/unistd.h>
#include <sys/cmn_err.h>
#include <vm/as.h>
#include <vm/faultcode.h>
#include <sys/sysmacros.h>
#include <sys/procfs.h>
#include <sys/kmem.h>
#include <sys/autoconf.h>
#include <sys/ddi_impldefs.h>
#include <sys/sunddi.h>
#include <sys/aio_impl.h>
#include <sys/debug.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/fs/pxfs_ki.h>
#include <sys/contract/process_impl.h>

/*
 * external entry point.
 */
#ifdef _LP64
static int64_t kaioc(long, long, long, long, long, long);
#endif
static int kaio(ulong_t *, rval_t *);


#define	AIO_64		0
#define	AIO_32		1
#define	AIO_LARGEFILE	2

/*
 * implementation specific functions (private)
 */
#ifdef _LP64
static int alio(int, aiocb_t **, int, struct sigevent *);
#endif
static int aionotify(void);
static int aioinit(void);
static int aiostart(void);
static void alio_cleanup(aio_t *, aiocb_t **, int, int);
static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
    cred_t *);
static void lio_set_error(aio_req_t *);
static aio_t *aio_aiop_alloc();
static int aio_req_alloc(aio_req_t **, aio_result_t *);
static int aio_lio_alloc(aio_lio_t **);
static aio_req_t *aio_req_done(void *);
static aio_req_t *aio_req_remove(aio_req_t *);
static int aio_req_find(aio_result_t *, aio_req_t **);
static int aio_hash_insert(struct aio_req_t *, aio_t *);
static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
    aio_result_t *, vnode_t *);
static int aio_cleanup_thread(aio_t *);
static aio_lio_t *aio_list_get(aio_result_t *);
static void lio_set_uerror(void *, int);
extern void aio_zerolen(aio_req_t *);
static int aiowait(struct timeval *, int, long *);
static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
    aio_req_t *reqlist, aio_t *aiop, model_t model);
static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
static int aiosuspend(void *, int, struct timespec *, int,
    long *, int);
static int aliowait(int, void *, int, void *, int);
static int aioerror(void *, int);
static int aio_cancel(int, void *, long *, int);
static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
static int aiorw(int, void *, int, int);

static int alioLF(int, void *, int, void *);
static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
    aio_result_t *, vnode_t *);
static int alio32(int, void *, int, void *);
static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);

#ifdef _SYSCALL32_IMPL
static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
void aiocb_32ton(aiocb32_t *, aiocb_t *);
#endif /* _SYSCALL32_IMPL */

/*
 * implementation specific functions (external)
 */
void aio_req_free(aio_t *, aio_req_t *);

/*
 * Event Port framework
 */

void aio_req_free_port(aio_t *, aio_req_t *);
static int aio_port_callback(void *, int *, pid_t, int, void *);

/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>
#include <sys/syscall.h>

#ifdef _LP64

static struct sysent kaio_sysent = {
	6,
	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
	(int (*)())kaioc
};

#ifdef _SYSCALL32_IMPL
static struct sysent kaio_sysent32 = {
	7,
	SE_NOUNLOAD | SE_64RVAL,
	kaio
};
#endif /* _SYSCALL32_IMPL */

#else /* _LP64 */

static struct sysent kaio_sysent = {
	7,
	SE_NOUNLOAD | SE_32RVAL1,
	kaio
};

#endif /* _LP64 */

/*
 * Module linkage information for the kernel.
 */

static struct modlsys modlsys = {
	&mod_syscallops,
	"kernel Async I/O",
	&kaio_sysent
};

#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"kernel Async I/O for 32 bit compatibility",
	&kaio_sysent32
};
#endif /* _SYSCALL32_IMPL */


static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};

int
_init(void)
{
	int retval;

	if ((retval = mod_install(&modlinkage)) != 0)
		return (retval);

	return (0);
}

int
_fini(void)
{
	int retval;

	retval = mod_remove(&modlinkage);

	return (retval);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

#ifdef _LP64
static int64_t
kaioc(
	long	a0,
	long	a1,
	long	a2,
	long	a3,
	long	a4,
	long	a5)
{
	int	error;
	long	rval = 0;

	switch ((int)a0 & ~AIO_POLL_BIT) {
	case AIOREAD:
		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
		    (offset_t)a4, (aio_result_t *)a5, FREAD);
		break;
	case AIOWRITE:
		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
		break;
	case AIOWAIT:
		error = aiowait((struct timeval *)a1, (int)a2, &rval);
		break;
	case AIOWAITN:
		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
		    (timespec_t *)a4);
		break;
	case AIONOTIFY:
		error = aionotify();
		break;
	case AIOINIT:
		error = aioinit();
		break;
	case AIOSTART:
		error = aiostart();
		break;
	case AIOLIO:
		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
		    (struct sigevent *)a4);
		break;
	case AIOLIOWAIT:
		error = aliowait((int)a1, (void *)a2, (int)a3,
		    (struct sigevent *)a4, AIO_64);
		break;
	case AIOSUSPEND:
		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
		    (int)a4, &rval, AIO_64);
		break;
	case AIOERROR:
		error = aioerror((void *)a1, AIO_64);
		break;
	case AIOAREAD:
		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
		break;
	case AIOAWRITE:
		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
		break;
	case AIOCANCEL:
		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
		break;

	/*
	 * The large file related calls are valid only for the
	 * 32 bit kernel and not for the 64 bit kernel.
	 * On the 64 bit kernel we convert large file calls
	 * to regular 64 bit calls.
	 */

	default:
		error = EINVAL;
	}
	if (error)
		return ((int64_t)set_errno(error));
	return (rval);
}
#endif

static int
kaio(
	ulong_t *uap,
	rval_t *rvp)
{
	long rval = 0;
	int error = 0;
	offset_t	off;


	rvp->r_vals = 0;
#if defined(_LITTLE_ENDIAN)
	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
#else
	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
#endif

	switch (uap[0] & ~AIO_POLL_BIT) {
	/*
	 * It must be the 32 bit system call on 64 bit kernel
	 */
	case AIOREAD:
		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
	case AIOWRITE:
		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
	case AIOWAIT:
		error = aiowait((struct timeval *)uap[1], (int)uap[2],
		    &rval);
		break;
	case AIOWAITN:
		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
		    (uint_t *)uap[3], (timespec_t *)uap[4]);
		break;
	case AIONOTIFY:
		return (aionotify());
	case AIOINIT:
		return (aioinit());
	case AIOSTART:
		return (aiostart());
	case AIOLIO:
		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
		    (void *)uap[4]));
	case AIOLIOWAIT:
		return (aliowait((int)uap[1], (void *)uap[2],
		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
	case AIOSUSPEND:
		error = aiosuspend((void *)uap[1], (int)uap[2],
		    (timespec_t *)uap[3], (int)uap[4],
		    &rval, AIO_32);
		break;
	case AIOERROR:
		return (aioerror((void *)uap[1], AIO_32));
	case AIOAREAD:
		return (aiorw((int)uap[0], (void *)uap[1],
		    FREAD, AIO_32));
	case AIOAWRITE:
		return (aiorw((int)uap[0], (void *)uap[1],
		    FWRITE, AIO_32));
	case AIOCANCEL:
		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
		    AIO_32));
		break;
	case AIOLIO64:
		return (alioLF((int)uap[1], (void *)uap[2],
		    (int)uap[3], (void *)uap[4]));
	case AIOLIOWAIT64:
		return (aliowait(uap[1], (void *)uap[2],
		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
	case AIOSUSPEND64:
		error = aiosuspend((void *)uap[1], (int)uap[2],
		    (timespec_t *)uap[3], (int)uap[4], &rval,
		    AIO_LARGEFILE);
		break;
	case AIOERROR64:
		return (aioerror((void *)uap[1], AIO_LARGEFILE));
	case AIOAREAD64:
		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
		    AIO_LARGEFILE));
	case AIOAWRITE64:
		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
		    AIO_LARGEFILE));
	case AIOCANCEL64:
		error = (aio_cancel((int)uap[1], (void *)uap[2],
		    &rval, AIO_LARGEFILE));
		break;
	default:
		return (EINVAL);
	}

	rvp->r_val1 = rval;
	return (error);
}

/*
 * wake up LWPs in this process that are sleeping in
 * aiowait().
 */
static int
aionotify(void)
{
	aio_t	*aiop;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (0);

	mutex_enter(&aiop->aio_mutex);
	aiop->aio_notifycnt++;
	cv_broadcast(&aiop->aio_waitcv);
	mutex_exit(&aiop->aio_mutex);

	return (0);
}

static int
timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
    timestruc_t **rqtp, int *blocking)
{
#ifdef _SYSCALL32_IMPL
	struct timeval32 wait_time_32;
#endif
	struct timeval wait_time;
	model_t	model = get_udatamodel();

	*rqtp = NULL;
	if (timout == NULL) {		/* wait indefinitely */
		*blocking = 1;
		return (0);
	}

	/*
	 * Need to correctly compare with the -1 passed in for a user
	 * address pointer, with both 32 bit and 64 bit apps.
	 */
	if (model == DATAMODEL_NATIVE) {
		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
			*blocking = 0;
			return (0);
		}

		if (copyin(timout, &wait_time, sizeof (wait_time)))
			return (EFAULT);
	}
#ifdef _SYSCALL32_IMPL
	else {
		/*
		 * -1 from a 32bit app. It will not get sign extended.
		 * don't wait if -1.
		 */
		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
			*blocking = 0;
			return (0);
		}

		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
			return (EFAULT);
		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
	}
#endif /* _SYSCALL32_IMPL */

	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
		*blocking = 0;
		return (0);
	}

	if (wait_time.tv_sec < 0 ||
	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
		return (EINVAL);

	rqtime->tv_sec = wait_time.tv_sec;
	rqtime->tv_nsec = wait_time.tv_usec * 1000;
	*rqtp = rqtime;
	*blocking = 1;

	return (0);
}

static int
timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
    timestruc_t **rqtp, int *blocking)
{
#ifdef _SYSCALL32_IMPL
	timespec32_t wait_time_32;
#endif
	model_t	model = get_udatamodel();

	*rqtp = NULL;
	if (timout == NULL) {
		*blocking = 1;
		return (0);
	}

	if (model == DATAMODEL_NATIVE) {
		if (copyin(timout, rqtime, sizeof (*rqtime)))
			return (EFAULT);
	}
#ifdef _SYSCALL32_IMPL
	else {
		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
			return (EFAULT);
		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
	}
#endif /* _SYSCALL32_IMPL */

	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
		*blocking = 0;
		return (0);
	}

	if (rqtime->tv_sec < 0 ||
	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
		return (EINVAL);

	*rqtp = rqtime;
	*blocking = 1;

	return (0);
}

/*ARGSUSED*/
static int
aiowait(
	struct timeval	*timout,
	int	dontblockflg,
	long	*rval)
{
	int		error;
	aio_t		*aiop;
	aio_req_t	*reqp;
	clock_t		status;
	int		blocking;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	/*
	 * Establish the absolute future time for the timeout.
	 */
	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
	if (error)
		return (error);
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	mutex_enter(&aiop->aio_mutex);
	for (;;) {
		/* process requests on poll queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_mutex);
		}
		if ((reqp = aio_req_remove(NULL)) != NULL) {
			*rval = (long)reqp->aio_req_resultp;
			break;
		}
		/* user-level done queue might not be empty */
		if (aiop->aio_notifycnt > 0) {
			aiop->aio_notifycnt--;
			*rval = 1;
			break;
		}
		/* don't block if no outstanding aio */
		if (aiop->aio_outstanding == 0 && dontblockflg) {
			error = EINVAL;
			break;
		}
		if (blocking) {
			status = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);

			if (status > 0)		/* check done queue again */
				continue;
			if (status == 0) {	/* interrupted by a signal */
				error = EINTR;
				*rval = -1;
			} else {		/* timer expired */
				error = ETIME;
			}
		}
		break;
	}
	mutex_exit(&aiop->aio_mutex);
	if (reqp) {
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}
	return (error);
}

/*
 * aiowaitn can be used to reap completed asynchronous requests submitted with
 * lio_listio, aio_read or aio_write.
 * This function only reaps asynchronous raw I/Os.
 */

/*ARGSUSED*/
static int
aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
{
	int		error = 0;
	aio_t		*aiop;
	aio_req_t	*reqlist = NULL;
	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
	size_t		iocbsz;			/* users iocb size */
	size_t		riocbsz;		/* returned iocb size */
	int		iocb_index = 0;
	model_t		model = get_udatamodel();
	int		blocking = 1;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	if (aiop->aio_outstanding == 0)
		return (EAGAIN);

	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
		return (EFAULT);

	/* set *nwait to zero, if we must return prematurely */
	if (copyout(&cnt, nwait, sizeof (uint_t)))
		return (EFAULT);

	if (waitcnt == 0) {
		blocking = 0;
		rqtp = NULL;
		waitcnt = nent;
	} else {
		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
		if (error)
			return (error);
	}

	if (model == DATAMODEL_NATIVE)
		iocbsz = (sizeof (aiocb_t *) * nent);
#ifdef _SYSCALL32_IMPL
	else
		iocbsz = (sizeof (caddr32_t) * nent);
#endif /* _SYSCALL32_IMPL */

	/*
	 * Only one aio_waitn call is allowed at a time.
	 * The active aio_waitn will collect all requests
	 * out of the "done" list and if necessary it will wait
	 * for some/all pending requests to fulfill the nwait
	 * parameter.
	 * A second or further aio_waitn call will sleep here
	 * until the active aio_waitn finishes and leaves the kernel.
	 * If the second call does not block (poll), then return
	 * immediately with the error code EAGAIN.
	 * If the second call should block, then sleep here, but
	 * do not touch the timeout. The timeout starts when this
	 * aio_waitn call becomes active.
	 */

	mutex_enter(&aiop->aio_mutex);

	while (aiop->aio_flags & AIO_WAITN) {
		if (blocking == 0) {
			mutex_exit(&aiop->aio_mutex);
			return (EAGAIN);
		}

		/* block, no timeout */
		aiop->aio_flags |= AIO_WAITN_PENDING;
		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
			mutex_exit(&aiop->aio_mutex);
			return (EINTR);
		}
	}

	/*
	 * Establish the absolute future time for the timeout.
	 */
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
		aiop->aio_iocb = NULL;
	}

	if (aiop->aio_iocb == NULL) {
		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
		if (iocblist == NULL) {
			mutex_exit(&aiop->aio_mutex);
			return (ENOMEM);
		}
		aiop->aio_iocb = (aiocb_t **)iocblist;
		aiop->aio_iocbsz = iocbsz;
	} else {
		iocblist = (char *)aiop->aio_iocb;
	}

	aiop->aio_waitncnt = waitcnt;
	aiop->aio_flags |= AIO_WAITN;

	for (;;) {
		/* push requests on poll queue to done queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_mutex);
		}

		/* check for requests on done queue */
		if (aiop->aio_doneq) {
			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
			aiop->aio_waitncnt = waitcnt - cnt;
		}

		/* user-level done queue might not be empty */
		if (aiop->aio_notifycnt > 0) {
			aiop->aio_notifycnt--;
			error = 0;
			break;
		}

		/*
		 * if we are here a second time as a result of timer
		 * expiration, we reset error if there are enough
		 * aiocb's to satisfy the request.
		 * We return also if all requests are already done
		 * and we picked up the whole done queue.
		 */

		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
		    aiop->aio_doneq == NULL)) {
			error = 0;
			break;
		}

		if ((cnt < waitcnt) && blocking) {
			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);
			if (rval > 0)
				continue;
			if (rval < 0) {
				error = ETIME;
				blocking = 0;
				continue;
			}
			error = EINTR;
		}
		break;
	}

	mutex_exit(&aiop->aio_mutex);

	if (cnt > 0) {

		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
		    aiop, model);

		if (model == DATAMODEL_NATIVE)
			riocbsz = (sizeof (aiocb_t *) * cnt);
#ifdef _SYSCALL32_IMPL
		else
			riocbsz = (sizeof (caddr32_t) * cnt);
#endif /* _SYSCALL32_IMPL */

		if (copyout(iocblist, uiocb, riocbsz) ||
		    copyout(&cnt, nwait, sizeof (uint_t)))
			error = EFAULT;
	}

	if (aiop->aio_iocbsz > AIO_IOCB_MAX) {
		kmem_free(iocblist, aiop->aio_iocbsz);
		aiop->aio_iocb = NULL;
	}

	/* check if there is another thread waiting for execution */
	mutex_enter(&aiop->aio_mutex);
	aiop->aio_flags &= ~AIO_WAITN;
	if (aiop->aio_flags & AIO_WAITN_PENDING) {
		aiop->aio_flags &= ~AIO_WAITN_PENDING;
		cv_signal(&aiop->aio_waitncv);
	}
	mutex_exit(&aiop->aio_mutex);

	return (error);
}

/*
 * aio_unlock_requests
 * copies out the result of the request as well as the return value.
 * It builds the list of completed asynchronous requests,
 * unlocks the allocated memory ranges and
 * puts the aio request structure back into the free list.
810 */ 811 812 static int 813 aio_unlock_requests( 814 caddr_t iocblist, 815 int iocb_index, 816 aio_req_t *reqlist, 817 aio_t *aiop, 818 model_t model) 819 { 820 aio_req_t *reqp, *nreqp; 821 822 if (model == DATAMODEL_NATIVE) { 823 for (reqp = reqlist; reqp != NULL; reqp = nreqp) { 824 (((caddr_t *)iocblist)[iocb_index++]) = 825 reqp->aio_req_iocb.iocb; 826 nreqp = reqp->aio_req_next; 827 aphysio_unlock(reqp); 828 aio_copyout_result(reqp); 829 mutex_enter(&aiop->aio_mutex); 830 aio_req_free(aiop, reqp); 831 mutex_exit(&aiop->aio_mutex); 832 } 833 } 834 #ifdef _SYSCALL32_IMPL 835 else { 836 for (reqp = reqlist; reqp != NULL; reqp = nreqp) { 837 ((caddr32_t *)iocblist)[iocb_index++] = 838 reqp->aio_req_iocb.iocb32; 839 nreqp = reqp->aio_req_next; 840 aphysio_unlock(reqp); 841 aio_copyout_result(reqp); 842 mutex_enter(&aiop->aio_mutex); 843 aio_req_free(aiop, reqp); 844 mutex_exit(&aiop->aio_mutex); 845 } 846 } 847 #endif /* _SYSCALL32_IMPL */ 848 return (iocb_index); 849 } 850 851 /* 852 * aio_reqlist_concat 853 * moves "max" elements from the done queue to the reqlist queue and removes 854 * the AIO_DONEQ flag. 855 * - reqlist queue is a simple linked list 856 * - done queue is a double linked list 857 */ 858 859 static int 860 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max) 861 { 862 aio_req_t *q2, *q2work, *list; 863 int count = 0; 864 865 list = *reqlist; 866 q2 = aiop->aio_doneq; 867 q2work = q2; 868 while (max-- > 0) { 869 q2work->aio_req_flags &= ~AIO_DONEQ; 870 q2work = q2work->aio_req_next; 871 count++; 872 if (q2work == q2) 873 break; 874 } 875 876 if (q2work == q2) { 877 /* all elements revised */ 878 q2->aio_req_prev->aio_req_next = list; 879 list = q2; 880 aiop->aio_doneq = NULL; 881 } else { 882 /* 883 * max < elements in the doneq 884 * detach only the required amount of elements 885 * out of the doneq 886 */ 887 q2work->aio_req_prev->aio_req_next = list; 888 list = q2; 889 890 aiop->aio_doneq = q2work; 891 q2work->aio_req_prev = q2->aio_req_prev; 892 q2->aio_req_prev->aio_req_next = q2work; 893 } 894 *reqlist = list; 895 return (count); 896 } 897 898 /*ARGSUSED*/ 899 static int 900 aiosuspend( 901 void *aiocb, 902 int nent, 903 struct timespec *timout, 904 int flag, 905 long *rval, 906 int run_mode) 907 { 908 int error; 909 aio_t *aiop; 910 aio_req_t *reqp, *found, *next; 911 caddr_t cbplist = NULL; 912 aiocb_t *cbp, **ucbp; 913 #ifdef _SYSCALL32_IMPL 914 aiocb32_t *cbp32; 915 caddr32_t *ucbp32; 916 #endif /* _SYSCALL32_IMPL */ 917 aiocb64_32_t *cbp64; 918 int rv; 919 int i; 920 size_t ssize; 921 model_t model = get_udatamodel(); 922 int blocking; 923 int timecheck; 924 timestruc_t rqtime; 925 timestruc_t *rqtp; 926 927 aiop = curproc->p_aio; 928 if (aiop == NULL || nent <= 0) 929 return (EINVAL); 930 931 /* 932 * Establish the absolute future time for the timeout. 933 */ 934 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking); 935 if (error) 936 return (error); 937 if (rqtp) { 938 timestruc_t now; 939 timecheck = timechanged; 940 gethrestime(&now); 941 timespecadd(rqtp, &now); 942 } 943 944 /* 945 * If we are not blocking and there's no IO complete 946 * skip aiocb copyin. 
947 */ 948 if (!blocking && (aiop->aio_pollq == NULL) && 949 (aiop->aio_doneq == NULL)) { 950 return (EAGAIN); 951 } 952 953 if (model == DATAMODEL_NATIVE) 954 ssize = (sizeof (aiocb_t *) * nent); 955 #ifdef _SYSCALL32_IMPL 956 else 957 ssize = (sizeof (caddr32_t) * nent); 958 #endif /* _SYSCALL32_IMPL */ 959 960 cbplist = kmem_alloc(ssize, KM_NOSLEEP); 961 if (cbplist == NULL) 962 return (ENOMEM); 963 964 if (copyin(aiocb, cbplist, ssize)) { 965 error = EFAULT; 966 goto done; 967 } 968 969 found = NULL; 970 /* 971 * we need to get the aio_cleanupq_mutex since we call 972 * aio_req_done(). 973 */ 974 mutex_enter(&aiop->aio_cleanupq_mutex); 975 mutex_enter(&aiop->aio_mutex); 976 for (;;) { 977 /* push requests on poll queue to done queue */ 978 if (aiop->aio_pollq) { 979 mutex_exit(&aiop->aio_mutex); 980 mutex_exit(&aiop->aio_cleanupq_mutex); 981 aio_cleanup(0); 982 mutex_enter(&aiop->aio_cleanupq_mutex); 983 mutex_enter(&aiop->aio_mutex); 984 } 985 /* check for requests on done queue */ 986 if (aiop->aio_doneq) { 987 if (model == DATAMODEL_NATIVE) 988 ucbp = (aiocb_t **)cbplist; 989 #ifdef _SYSCALL32_IMPL 990 else 991 ucbp32 = (caddr32_t *)cbplist; 992 #endif /* _SYSCALL32_IMPL */ 993 for (i = 0; i < nent; i++) { 994 if (model == DATAMODEL_NATIVE) { 995 if ((cbp = *ucbp++) == NULL) 996 continue; 997 if (run_mode != AIO_LARGEFILE) 998 reqp = aio_req_done( 999 &cbp->aio_resultp); 1000 else { 1001 cbp64 = (aiocb64_32_t *)cbp; 1002 reqp = aio_req_done( 1003 &cbp64->aio_resultp); 1004 } 1005 } 1006 #ifdef _SYSCALL32_IMPL 1007 else { 1008 if (run_mode == AIO_32) { 1009 if ((cbp32 = 1010 (aiocb32_t *)(uintptr_t) 1011 *ucbp32++) == NULL) 1012 continue; 1013 reqp = aio_req_done( 1014 &cbp32->aio_resultp); 1015 } else if (run_mode == AIO_LARGEFILE) { 1016 if ((cbp64 = 1017 (aiocb64_32_t *)(uintptr_t) 1018 *ucbp32++) == NULL) 1019 continue; 1020 reqp = aio_req_done( 1021 &cbp64->aio_resultp); 1022 } 1023 1024 } 1025 #endif /* _SYSCALL32_IMPL */ 1026 if (reqp) { 1027 reqp->aio_req_next = found; 1028 found = reqp; 1029 } 1030 if (aiop->aio_doneq == NULL) 1031 break; 1032 } 1033 if (found) 1034 break; 1035 } 1036 if (aiop->aio_notifycnt > 0) { 1037 /* 1038 * nothing on the kernel's queue. the user 1039 * has notified the kernel that it has items 1040 * on a user-level queue. 1041 */ 1042 aiop->aio_notifycnt--; 1043 *rval = 1; 1044 error = 0; 1045 break; 1046 } 1047 /* don't block if nothing is outstanding */ 1048 if (aiop->aio_outstanding == 0) { 1049 error = EAGAIN; 1050 break; 1051 } 1052 if (blocking) { 1053 /* 1054 * drop the aio_cleanupq_mutex as we are 1055 * going to block. 1056 */ 1057 mutex_exit(&aiop->aio_cleanupq_mutex); 1058 rv = cv_waituntil_sig(&aiop->aio_waitcv, 1059 &aiop->aio_mutex, rqtp, timecheck); 1060 /* 1061 * we have to drop aio_mutex and 1062 * grab it in the right order. 
1063 */ 1064 mutex_exit(&aiop->aio_mutex); 1065 mutex_enter(&aiop->aio_cleanupq_mutex); 1066 mutex_enter(&aiop->aio_mutex); 1067 if (rv > 0) /* check done queue again */ 1068 continue; 1069 if (rv == 0) /* interrupted by a signal */ 1070 error = EINTR; 1071 else /* timer expired */ 1072 error = ETIME; 1073 } else { 1074 error = EAGAIN; 1075 } 1076 break; 1077 } 1078 mutex_exit(&aiop->aio_mutex); 1079 mutex_exit(&aiop->aio_cleanupq_mutex); 1080 for (reqp = found; reqp != NULL; reqp = next) { 1081 next = reqp->aio_req_next; 1082 aphysio_unlock(reqp); 1083 aio_copyout_result(reqp); 1084 mutex_enter(&aiop->aio_mutex); 1085 aio_req_free(aiop, reqp); 1086 mutex_exit(&aiop->aio_mutex); 1087 } 1088 done: 1089 kmem_free(cbplist, ssize); 1090 return (error); 1091 } 1092 1093 /* 1094 * initialize aio by allocating an aio_t struct for this 1095 * process. 1096 */ 1097 static int 1098 aioinit(void) 1099 { 1100 proc_t *p = curproc; 1101 aio_t *aiop; 1102 mutex_enter(&p->p_lock); 1103 if ((aiop = p->p_aio) == NULL) { 1104 aiop = aio_aiop_alloc(); 1105 p->p_aio = aiop; 1106 } 1107 mutex_exit(&p->p_lock); 1108 if (aiop == NULL) 1109 return (ENOMEM); 1110 return (0); 1111 } 1112 1113 /* 1114 * start a special thread that will cleanup after aio requests 1115 * that are preventing a segment from being unmapped. as_unmap() 1116 * blocks until all phsyio to this segment is completed. this 1117 * doesn't happen until all the pages in this segment are not 1118 * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio 1119 * requests still outstanding. this special thread will make sure 1120 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed. 1121 * 1122 * this function will return an error if the process has only 1123 * one LWP. the assumption is that the caller is a separate LWP 1124 * that remains blocked in the kernel for the life of this process. 1125 */ 1126 static int 1127 aiostart(void) 1128 { 1129 proc_t *p = curproc; 1130 aio_t *aiop; 1131 int first, error = 0; 1132 1133 if (p->p_lwpcnt == 1) 1134 return (EDEADLK); 1135 mutex_enter(&p->p_lock); 1136 if ((aiop = p->p_aio) == NULL) 1137 error = EINVAL; 1138 else { 1139 first = aiop->aio_ok; 1140 if (aiop->aio_ok == 0) 1141 aiop->aio_ok = 1; 1142 } 1143 mutex_exit(&p->p_lock); 1144 if (error == 0 && first == 0) { 1145 return (aio_cleanup_thread(aiop)); 1146 /* should return only to exit */ 1147 } 1148 return (error); 1149 } 1150 1151 /* 1152 * Associate an aiocb with a port. 1153 * This function is used by aiorw() to associate a transaction with a port. 1154 * Allocate an event port structure (port_alloc_event()) and store the 1155 * delivered user pointer (portnfy_user) in the portkev_user field of the 1156 * port_kevent_t structure.. 1157 * The aio_req_portkev pointer in the aio_req_t structure was added to identify 1158 * the port association. 
1159 */ 1160 1161 static int 1162 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp, 1163 aio_req_t *reqp, int event) 1164 { 1165 port_kevent_t *pkevp = NULL; 1166 int error; 1167 1168 error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT, 1169 PORT_SOURCE_AIO, &pkevp); 1170 if (error) { 1171 if ((error == ENOMEM) || (error == EAGAIN)) 1172 error = EAGAIN; 1173 else 1174 error = EINVAL; 1175 } else { 1176 port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user, 1177 aio_port_callback, reqp); 1178 pkevp->portkev_events = event; 1179 reqp->aio_req_portkev = pkevp; 1180 reqp->aio_req_port = pntfy->portnfy_port; 1181 } 1182 return (error); 1183 } 1184 1185 #ifdef _LP64 1186 1187 /* 1188 * Asynchronous list IO. A chain of aiocb's are copied in 1189 * one at a time. If the aiocb is invalid, it is skipped. 1190 * For each aiocb, the appropriate driver entry point is 1191 * called. Optimize for the common case where the list 1192 * of requests is to the same file descriptor. 1193 * 1194 * One possible optimization is to define a new driver entry 1195 * point that supports a list of IO requests. Whether this 1196 * improves performance depends somewhat on the driver's 1197 * locking strategy. Processing a list could adversely impact 1198 * the driver's interrupt latency. 1199 */ 1200 static int 1201 alio( 1202 int mode_arg, 1203 aiocb_t **aiocb_arg, 1204 int nent, 1205 struct sigevent *sigev) 1206 { 1207 file_t *fp; 1208 file_t *prev_fp = NULL; 1209 int prev_mode = -1; 1210 struct vnode *vp; 1211 aio_lio_t *head; 1212 aio_req_t *reqp; 1213 aio_t *aiop; 1214 caddr_t cbplist; 1215 aiocb_t cb; 1216 aiocb_t *aiocb = &cb; 1217 aiocb_t *cbp; 1218 aiocb_t **ucbp; 1219 struct sigevent sigevk; 1220 sigqueue_t *sqp; 1221 int (*aio_func)(); 1222 int mode; 1223 int error = 0; 1224 int aio_errors = 0; 1225 int i; 1226 size_t ssize; 1227 int deadhead = 0; 1228 int aio_notsupported = 0; 1229 int lio_head_port; 1230 int aio_port; 1231 int aio_thread; 1232 port_kevent_t *pkevtp = NULL; 1233 port_notify_t pnotify; 1234 int event; 1235 1236 aiop = curproc->p_aio; 1237 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 1238 return (EINVAL); 1239 1240 ssize = (sizeof (aiocb_t *) * nent); 1241 cbplist = kmem_alloc(ssize, KM_SLEEP); 1242 ucbp = (aiocb_t **)cbplist; 1243 1244 if (copyin(aiocb_arg, cbplist, ssize) || 1245 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) { 1246 kmem_free(cbplist, ssize); 1247 return (EFAULT); 1248 } 1249 1250 /* Event Ports */ 1251 if (sigev && 1252 (sigevk.sigev_notify == SIGEV_THREAD || 1253 sigevk.sigev_notify == SIGEV_PORT)) { 1254 if (sigevk.sigev_notify == SIGEV_THREAD) { 1255 pnotify.portnfy_port = sigevk.sigev_signo; 1256 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr; 1257 } else if (copyin(sigevk.sigev_value.sival_ptr, 1258 &pnotify, sizeof (pnotify))) { 1259 kmem_free(cbplist, ssize); 1260 return (EFAULT); 1261 } 1262 error = port_alloc_event(pnotify.portnfy_port, 1263 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp); 1264 if (error) { 1265 if (error == ENOMEM || error == EAGAIN) 1266 error = EAGAIN; 1267 else 1268 error = EINVAL; 1269 kmem_free(cbplist, ssize); 1270 return (error); 1271 } 1272 lio_head_port = pnotify.portnfy_port; 1273 } 1274 1275 /* 1276 * a list head should be allocated if notification is 1277 * enabled for this list. 
1278 */ 1279 head = NULL; 1280 1281 if (mode_arg == LIO_WAIT || sigev) { 1282 mutex_enter(&aiop->aio_mutex); 1283 error = aio_lio_alloc(&head); 1284 mutex_exit(&aiop->aio_mutex); 1285 if (error) 1286 goto done; 1287 deadhead = 1; 1288 head->lio_nent = nent; 1289 head->lio_refcnt = nent; 1290 head->lio_port = -1; 1291 head->lio_portkev = NULL; 1292 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL && 1293 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) { 1294 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 1295 if (sqp == NULL) { 1296 error = EAGAIN; 1297 goto done; 1298 } 1299 sqp->sq_func = NULL; 1300 sqp->sq_next = NULL; 1301 sqp->sq_info.si_code = SI_ASYNCIO; 1302 sqp->sq_info.si_pid = curproc->p_pid; 1303 sqp->sq_info.si_ctid = PRCTID(curproc); 1304 sqp->sq_info.si_zoneid = getzoneid(); 1305 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 1306 sqp->sq_info.si_signo = sigevk.sigev_signo; 1307 sqp->sq_info.si_value = sigevk.sigev_value; 1308 head->lio_sigqp = sqp; 1309 } else { 1310 head->lio_sigqp = NULL; 1311 } 1312 if (pkevtp) { 1313 /* 1314 * Prepare data to send when list of aiocb's 1315 * has completed. 1316 */ 1317 port_init_event(pkevtp, (uintptr_t)sigev, 1318 (void *)(uintptr_t)pnotify.portnfy_user, 1319 NULL, head); 1320 pkevtp->portkev_events = AIOLIO; 1321 head->lio_portkev = pkevtp; 1322 head->lio_port = pnotify.portnfy_port; 1323 } 1324 } 1325 1326 for (i = 0; i < nent; i++, ucbp++) { 1327 1328 cbp = *ucbp; 1329 /* skip entry if it can't be copied. */ 1330 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) { 1331 if (head) { 1332 mutex_enter(&aiop->aio_mutex); 1333 head->lio_nent--; 1334 head->lio_refcnt--; 1335 mutex_exit(&aiop->aio_mutex); 1336 } 1337 continue; 1338 } 1339 1340 /* skip if opcode for aiocb is LIO_NOP */ 1341 mode = aiocb->aio_lio_opcode; 1342 if (mode == LIO_NOP) { 1343 cbp = NULL; 1344 if (head) { 1345 mutex_enter(&aiop->aio_mutex); 1346 head->lio_nent--; 1347 head->lio_refcnt--; 1348 mutex_exit(&aiop->aio_mutex); 1349 } 1350 continue; 1351 } 1352 1353 /* increment file descriptor's ref count. */ 1354 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 1355 lio_set_uerror(&cbp->aio_resultp, EBADF); 1356 if (head) { 1357 mutex_enter(&aiop->aio_mutex); 1358 head->lio_nent--; 1359 head->lio_refcnt--; 1360 mutex_exit(&aiop->aio_mutex); 1361 } 1362 aio_errors++; 1363 continue; 1364 } 1365 1366 /* 1367 * check the permission of the partition 1368 */ 1369 if ((fp->f_flag & mode) == 0) { 1370 releasef(aiocb->aio_fildes); 1371 lio_set_uerror(&cbp->aio_resultp, EBADF); 1372 if (head) { 1373 mutex_enter(&aiop->aio_mutex); 1374 head->lio_nent--; 1375 head->lio_refcnt--; 1376 mutex_exit(&aiop->aio_mutex); 1377 } 1378 aio_errors++; 1379 continue; 1380 } 1381 1382 /* 1383 * common case where requests are to the same fd 1384 * for the same r/w operation. 
1385 * for UFS, need to set EBADFD 1386 */ 1387 vp = fp->f_vnode; 1388 if (fp != prev_fp || mode != prev_mode) { 1389 aio_func = check_vp(vp, mode); 1390 if (aio_func == NULL) { 1391 prev_fp = NULL; 1392 releasef(aiocb->aio_fildes); 1393 lio_set_uerror(&cbp->aio_resultp, EBADFD); 1394 aio_notsupported++; 1395 if (head) { 1396 mutex_enter(&aiop->aio_mutex); 1397 head->lio_nent--; 1398 head->lio_refcnt--; 1399 mutex_exit(&aiop->aio_mutex); 1400 } 1401 continue; 1402 } else { 1403 prev_fp = fp; 1404 prev_mode = mode; 1405 } 1406 } 1407 1408 error = aio_req_setup(&reqp, aiop, aiocb, 1409 &cbp->aio_resultp, vp); 1410 if (error) { 1411 releasef(aiocb->aio_fildes); 1412 lio_set_uerror(&cbp->aio_resultp, error); 1413 if (head) { 1414 mutex_enter(&aiop->aio_mutex); 1415 head->lio_nent--; 1416 head->lio_refcnt--; 1417 mutex_exit(&aiop->aio_mutex); 1418 } 1419 aio_errors++; 1420 continue; 1421 } 1422 1423 reqp->aio_req_lio = head; 1424 deadhead = 0; 1425 1426 /* 1427 * Set the errno field now before sending the request to 1428 * the driver to avoid a race condition 1429 */ 1430 (void) suword32(&cbp->aio_resultp.aio_errno, 1431 EINPROGRESS); 1432 1433 reqp->aio_req_iocb.iocb = (caddr_t)cbp; 1434 1435 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE; 1436 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT); 1437 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD); 1438 if (aio_port | aio_thread) { 1439 port_kevent_t *lpkevp; 1440 /* 1441 * Prepare data to send with each aiocb completed. 1442 */ 1443 if (aio_port) { 1444 void *paddr = 1445 aiocb->aio_sigevent.sigev_value.sival_ptr; 1446 if (copyin(paddr, &pnotify, sizeof (pnotify))) 1447 error = EFAULT; 1448 } else { /* aio_thread */ 1449 pnotify.portnfy_port = 1450 aiocb->aio_sigevent.sigev_signo; 1451 pnotify.portnfy_user = 1452 aiocb->aio_sigevent.sigev_value.sival_ptr; 1453 } 1454 if (error) 1455 /* EMPTY */; 1456 else if (pkevtp != NULL && 1457 pnotify.portnfy_port == lio_head_port) 1458 error = port_dup_event(pkevtp, &lpkevp, 1459 PORT_ALLOC_DEFAULT); 1460 else 1461 error = port_alloc_event(pnotify.portnfy_port, 1462 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, 1463 &lpkevp); 1464 if (error == 0) { 1465 port_init_event(lpkevp, (uintptr_t)cbp, 1466 (void *)(uintptr_t)pnotify.portnfy_user, 1467 aio_port_callback, reqp); 1468 lpkevp->portkev_events = event; 1469 reqp->aio_req_portkev = lpkevp; 1470 reqp->aio_req_port = pnotify.portnfy_port; 1471 } 1472 } 1473 1474 /* 1475 * send the request to driver. 1476 */ 1477 if (error == 0) { 1478 if (aiocb->aio_nbytes == 0) { 1479 clear_active_fd(aiocb->aio_fildes); 1480 aio_zerolen(reqp); 1481 continue; 1482 } 1483 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 1484 CRED()); 1485 } 1486 1487 /* 1488 * the fd's ref count is not decremented until the IO has 1489 * completed unless there was an error. 
1490 */ 1491 if (error) { 1492 releasef(aiocb->aio_fildes); 1493 lio_set_uerror(&cbp->aio_resultp, error); 1494 if (head) { 1495 mutex_enter(&aiop->aio_mutex); 1496 head->lio_nent--; 1497 head->lio_refcnt--; 1498 mutex_exit(&aiop->aio_mutex); 1499 } 1500 if (error == ENOTSUP) 1501 aio_notsupported++; 1502 else 1503 aio_errors++; 1504 lio_set_error(reqp); 1505 } else { 1506 clear_active_fd(aiocb->aio_fildes); 1507 } 1508 } 1509 1510 if (aio_notsupported) { 1511 error = ENOTSUP; 1512 } else if (aio_errors) { 1513 /* 1514 * return EIO if any request failed 1515 */ 1516 error = EIO; 1517 } 1518 1519 if (mode_arg == LIO_WAIT) { 1520 mutex_enter(&aiop->aio_mutex); 1521 while (head->lio_refcnt > 0) { 1522 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 1523 mutex_exit(&aiop->aio_mutex); 1524 error = EINTR; 1525 goto done; 1526 } 1527 } 1528 mutex_exit(&aiop->aio_mutex); 1529 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64); 1530 } 1531 1532 done: 1533 kmem_free(cbplist, ssize); 1534 if (deadhead) { 1535 if (head->lio_sigqp) 1536 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 1537 if (head->lio_portkev) 1538 port_free_event(head->lio_portkev); 1539 kmem_free(head, sizeof (aio_lio_t)); 1540 } 1541 return (error); 1542 } 1543 1544 #endif /* _LP64 */ 1545 1546 /* 1547 * Asynchronous list IO. 1548 * If list I/O is called with LIO_WAIT it can still return 1549 * before all the I/O's are completed if a signal is caught 1550 * or if the list include UFS I/O requests. If this happens, 1551 * libaio will call aliowait() to wait for the I/O's to 1552 * complete 1553 */ 1554 /*ARGSUSED*/ 1555 static int 1556 aliowait( 1557 int mode, 1558 void *aiocb, 1559 int nent, 1560 void *sigev, 1561 int run_mode) 1562 { 1563 aio_lio_t *head; 1564 aio_t *aiop; 1565 caddr_t cbplist; 1566 aiocb_t *cbp, **ucbp; 1567 #ifdef _SYSCALL32_IMPL 1568 aiocb32_t *cbp32; 1569 caddr32_t *ucbp32; 1570 aiocb64_32_t *cbp64; 1571 #endif 1572 int error = 0; 1573 int i; 1574 size_t ssize = 0; 1575 model_t model = get_udatamodel(); 1576 1577 aiop = curproc->p_aio; 1578 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 1579 return (EINVAL); 1580 1581 if (model == DATAMODEL_NATIVE) 1582 ssize = (sizeof (aiocb_t *) * nent); 1583 #ifdef _SYSCALL32_IMPL 1584 else 1585 ssize = (sizeof (caddr32_t) * nent); 1586 #endif /* _SYSCALL32_IMPL */ 1587 1588 if (ssize == 0) 1589 return (EINVAL); 1590 1591 cbplist = kmem_alloc(ssize, KM_SLEEP); 1592 1593 if (model == DATAMODEL_NATIVE) 1594 ucbp = (aiocb_t **)cbplist; 1595 #ifdef _SYSCALL32_IMPL 1596 else 1597 ucbp32 = (caddr32_t *)cbplist; 1598 #endif /* _SYSCALL32_IMPL */ 1599 1600 if (copyin(aiocb, cbplist, ssize)) { 1601 error = EFAULT; 1602 goto done; 1603 } 1604 1605 /* 1606 * To find the list head, we go through the 1607 * list of aiocb structs, find the request 1608 * its for, then get the list head that reqp 1609 * points to 1610 */ 1611 head = NULL; 1612 1613 for (i = 0; i < nent; i++) { 1614 if (model == DATAMODEL_NATIVE) { 1615 /* 1616 * Since we are only checking for a NULL pointer 1617 * Following should work on both native data sizes 1618 * as well as for largefile aiocb. 1619 */ 1620 if ((cbp = *ucbp++) == NULL) 1621 continue; 1622 if (run_mode != AIO_LARGEFILE) 1623 if (head = aio_list_get(&cbp->aio_resultp)) 1624 break; 1625 else { 1626 /* 1627 * This is a case when largefile call is 1628 * made on 32 bit kernel. 
1629 * Treat each pointer as pointer to 1630 * aiocb64_32 1631 */ 1632 if (head = aio_list_get((aio_result_t *) 1633 &(((aiocb64_32_t *)cbp)->aio_resultp))) 1634 break; 1635 } 1636 } 1637 #ifdef _SYSCALL32_IMPL 1638 else { 1639 if (run_mode == AIO_LARGEFILE) { 1640 if ((cbp64 = (aiocb64_32_t *) 1641 (uintptr_t)*ucbp32++) == NULL) 1642 continue; 1643 if (head = aio_list_get((aio_result_t *) 1644 &cbp64->aio_resultp)) 1645 break; 1646 } else if (run_mode == AIO_32) { 1647 if ((cbp32 = (aiocb32_t *) 1648 (uintptr_t)*ucbp32++) == NULL) 1649 continue; 1650 if (head = aio_list_get((aio_result_t *) 1651 &cbp32->aio_resultp)) 1652 break; 1653 } 1654 } 1655 #endif /* _SYSCALL32_IMPL */ 1656 } 1657 1658 if (head == NULL) { 1659 error = EINVAL; 1660 goto done; 1661 } 1662 1663 mutex_enter(&aiop->aio_mutex); 1664 while (head->lio_refcnt > 0) { 1665 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 1666 mutex_exit(&aiop->aio_mutex); 1667 error = EINTR; 1668 goto done; 1669 } 1670 } 1671 mutex_exit(&aiop->aio_mutex); 1672 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode); 1673 done: 1674 kmem_free(cbplist, ssize); 1675 return (error); 1676 } 1677 1678 aio_lio_t * 1679 aio_list_get(aio_result_t *resultp) 1680 { 1681 aio_lio_t *head = NULL; 1682 aio_t *aiop; 1683 aio_req_t **bucket; 1684 aio_req_t *reqp; 1685 long index; 1686 1687 aiop = curproc->p_aio; 1688 if (aiop == NULL) 1689 return (NULL); 1690 1691 if (resultp) { 1692 index = AIO_HASH(resultp); 1693 bucket = &aiop->aio_hash[index]; 1694 for (reqp = *bucket; reqp != NULL; 1695 reqp = reqp->aio_hash_next) { 1696 if (reqp->aio_req_resultp == resultp) { 1697 head = reqp->aio_req_lio; 1698 return (head); 1699 } 1700 } 1701 } 1702 return (NULL); 1703 } 1704 1705 1706 static void 1707 lio_set_uerror(void *resultp, int error) 1708 { 1709 /* 1710 * the resultp field is a pointer to where the 1711 * error should be written out to the user's 1712 * aiocb. 1713 * 1714 */ 1715 if (get_udatamodel() == DATAMODEL_NATIVE) { 1716 (void) sulword(&((aio_result_t *)resultp)->aio_return, 1717 (ssize_t)-1); 1718 (void) suword32(&((aio_result_t *)resultp)->aio_errno, error); 1719 } 1720 #ifdef _SYSCALL32_IMPL 1721 else { 1722 (void) suword32(&((aio_result32_t *)resultp)->aio_return, 1723 (uint_t)-1); 1724 (void) suword32(&((aio_result32_t *)resultp)->aio_errno, error); 1725 } 1726 #endif /* _SYSCALL32_IMPL */ 1727 } 1728 1729 /* 1730 * do cleanup completion for all requests in list. memory for 1731 * each request is also freed. 
1732 */ 1733 static void 1734 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode) 1735 { 1736 int i; 1737 aio_req_t *reqp; 1738 aio_result_t *resultp; 1739 aiocb64_32_t *aiocb_64; 1740 1741 for (i = 0; i < nent; i++) { 1742 if (get_udatamodel() == DATAMODEL_NATIVE) { 1743 if (cbp[i] == NULL) 1744 continue; 1745 if (run_mode == AIO_LARGEFILE) { 1746 aiocb_64 = (aiocb64_32_t *)cbp[i]; 1747 resultp = (aio_result_t *) 1748 &aiocb_64->aio_resultp; 1749 } else 1750 resultp = &cbp[i]->aio_resultp; 1751 } 1752 #ifdef _SYSCALL32_IMPL 1753 else { 1754 aiocb32_t *aiocb_32; 1755 caddr32_t *cbp32; 1756 1757 cbp32 = (caddr32_t *)cbp; 1758 if (cbp32[i] == NULL) 1759 continue; 1760 if (run_mode == AIO_32) { 1761 aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i]; 1762 resultp = (aio_result_t *)&aiocb_32-> 1763 aio_resultp; 1764 } else if (run_mode == AIO_LARGEFILE) { 1765 aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i]; 1766 resultp = (aio_result_t *)&aiocb_64-> 1767 aio_resultp; 1768 } 1769 } 1770 #endif /* _SYSCALL32_IMPL */ 1771 /* 1772 * we need to get the aio_cleanupq_mutex since we call 1773 * aio_req_done(). 1774 */ 1775 mutex_enter(&aiop->aio_cleanupq_mutex); 1776 mutex_enter(&aiop->aio_mutex); 1777 reqp = aio_req_done(resultp); 1778 mutex_exit(&aiop->aio_mutex); 1779 mutex_exit(&aiop->aio_cleanupq_mutex); 1780 if (reqp != NULL) { 1781 aphysio_unlock(reqp); 1782 aio_copyout_result(reqp); 1783 mutex_enter(&aiop->aio_mutex); 1784 aio_req_free(aiop, reqp); 1785 mutex_exit(&aiop->aio_mutex); 1786 } 1787 } 1788 } 1789 1790 /* 1791 * Write out the results for an aio request that is done. 1792 */ 1793 static int 1794 aioerror(void *cb, int run_mode) 1795 { 1796 aio_result_t *resultp; 1797 aio_t *aiop; 1798 aio_req_t *reqp; 1799 int retval; 1800 1801 aiop = curproc->p_aio; 1802 if (aiop == NULL || cb == NULL) 1803 return (EINVAL); 1804 1805 if (get_udatamodel() == DATAMODEL_NATIVE) { 1806 if (run_mode == AIO_LARGEFILE) 1807 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)-> 1808 aio_resultp; 1809 else 1810 resultp = &((aiocb_t *)cb)->aio_resultp; 1811 } 1812 #ifdef _SYSCALL32_IMPL 1813 else { 1814 if (run_mode == AIO_LARGEFILE) 1815 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)-> 1816 aio_resultp; 1817 else if (run_mode == AIO_32) 1818 resultp = (aio_result_t *)&((aiocb32_t *)cb)-> 1819 aio_resultp; 1820 } 1821 #endif /* _SYSCALL32_IMPL */ 1822 /* 1823 * we need to get the aio_cleanupq_mutex since we call 1824 * aio_req_find(). 
1825 */ 1826 mutex_enter(&aiop->aio_cleanupq_mutex); 1827 mutex_enter(&aiop->aio_mutex); 1828 retval = aio_req_find(resultp, &reqp); 1829 mutex_exit(&aiop->aio_mutex); 1830 mutex_exit(&aiop->aio_cleanupq_mutex); 1831 if (retval == 0) { 1832 aphysio_unlock(reqp); 1833 aio_copyout_result(reqp); 1834 mutex_enter(&aiop->aio_mutex); 1835 aio_req_free(aiop, reqp); 1836 mutex_exit(&aiop->aio_mutex); 1837 return (0); 1838 } else if (retval == 1) 1839 return (EINPROGRESS); 1840 else if (retval == 2) 1841 return (EINVAL); 1842 return (0); 1843 } 1844 1845 /* 1846 * aio_cancel - if no requests outstanding, 1847 * return AIO_ALLDONE 1848 * else 1849 * return AIO_NOTCANCELED 1850 */ 1851 static int 1852 aio_cancel( 1853 int fildes, 1854 void *cb, 1855 long *rval, 1856 int run_mode) 1857 { 1858 aio_t *aiop; 1859 void *resultp; 1860 int index; 1861 aio_req_t **bucket; 1862 aio_req_t *ent; 1863 1864 1865 /* 1866 * Verify valid file descriptor 1867 */ 1868 if ((getf(fildes)) == NULL) { 1869 return (EBADF); 1870 } 1871 releasef(fildes); 1872 1873 aiop = curproc->p_aio; 1874 if (aiop == NULL) 1875 return (EINVAL); 1876 1877 if (aiop->aio_outstanding == 0) { 1878 *rval = AIO_ALLDONE; 1879 return (0); 1880 } 1881 1882 mutex_enter(&aiop->aio_mutex); 1883 if (cb != NULL) { 1884 if (get_udatamodel() == DATAMODEL_NATIVE) { 1885 if (run_mode == AIO_LARGEFILE) 1886 resultp = (aio_result_t *)&((aiocb64_32_t *)cb) 1887 ->aio_resultp; 1888 else 1889 resultp = &((aiocb_t *)cb)->aio_resultp; 1890 } 1891 #ifdef _SYSCALL32_IMPL 1892 else { 1893 if (run_mode == AIO_LARGEFILE) 1894 resultp = (aio_result_t *)&((aiocb64_32_t *)cb) 1895 ->aio_resultp; 1896 else if (run_mode == AIO_32) 1897 resultp = (aio_result_t *)&((aiocb32_t *)cb) 1898 ->aio_resultp; 1899 } 1900 #endif /* _SYSCALL32_IMPL */ 1901 index = AIO_HASH(resultp); 1902 bucket = &aiop->aio_hash[index]; 1903 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 1904 if (ent->aio_req_resultp == resultp) { 1905 if ((ent->aio_req_flags & AIO_PENDING) == 0) { 1906 mutex_exit(&aiop->aio_mutex); 1907 *rval = AIO_ALLDONE; 1908 return (0); 1909 } 1910 mutex_exit(&aiop->aio_mutex); 1911 *rval = AIO_NOTCANCELED; 1912 return (0); 1913 } 1914 } 1915 mutex_exit(&aiop->aio_mutex); 1916 *rval = AIO_ALLDONE; 1917 return (0); 1918 } 1919 1920 for (index = 0; index < AIO_HASHSZ; index++) { 1921 bucket = &aiop->aio_hash[index]; 1922 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 1923 if (ent->aio_req_fd == fildes) { 1924 if ((ent->aio_req_flags & AIO_PENDING) != 0) { 1925 mutex_exit(&aiop->aio_mutex); 1926 *rval = AIO_NOTCANCELED; 1927 return (0); 1928 } 1929 } 1930 } 1931 } 1932 mutex_exit(&aiop->aio_mutex); 1933 *rval = AIO_ALLDONE; 1934 return (0); 1935 } 1936 1937 /* 1938 * solaris version of asynchronous read and write 1939 */ 1940 static int 1941 arw( 1942 int opcode, 1943 int fdes, 1944 char *bufp, 1945 int bufsize, 1946 offset_t offset, 1947 aio_result_t *resultp, 1948 int mode) 1949 { 1950 file_t *fp; 1951 int error; 1952 struct vnode *vp; 1953 aio_req_t *reqp; 1954 aio_t *aiop; 1955 int (*aio_func)(); 1956 #ifdef _LP64 1957 aiocb_t aiocb; 1958 #else 1959 aiocb64_32_t aiocb64; 1960 #endif 1961 1962 aiop = curproc->p_aio; 1963 if (aiop == NULL) 1964 return (EINVAL); 1965 1966 if ((fp = getf(fdes)) == NULL) { 1967 return (EBADF); 1968 } 1969 1970 /* 1971 * check the permission of the partition 1972 */ 1973 if ((fp->f_flag & mode) == 0) { 1974 releasef(fdes); 1975 return (EBADF); 1976 } 1977 1978 vp = fp->f_vnode; 1979 aio_func = check_vp(vp, mode); 1980 if 
(aio_func == NULL) { 1981 releasef(fdes); 1982 return (EBADFD); 1983 } 1984 #ifdef _LP64 1985 aiocb.aio_fildes = fdes; 1986 aiocb.aio_buf = bufp; 1987 aiocb.aio_nbytes = bufsize; 1988 aiocb.aio_offset = offset; 1989 aiocb.aio_sigevent.sigev_notify = 0; 1990 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp); 1991 #else 1992 aiocb64.aio_fildes = fdes; 1993 aiocb64.aio_buf = (caddr32_t)bufp; 1994 aiocb64.aio_nbytes = bufsize; 1995 aiocb64.aio_offset = offset; 1996 aiocb64.aio_sigevent.sigev_notify = 0; 1997 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp); 1998 #endif 1999 if (error) { 2000 releasef(fdes); 2001 return (error); 2002 } 2003 2004 /* 2005 * enable polling on this request if the opcode has 2006 * the AIO poll bit set 2007 */ 2008 if (opcode & AIO_POLL_BIT) 2009 reqp->aio_req_flags |= AIO_POLL; 2010 2011 if (bufsize == 0) { 2012 clear_active_fd(fdes); 2013 aio_zerolen(reqp); 2014 return (0); 2015 } 2016 /* 2017 * send the request to driver. 2018 */ 2019 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED()); 2020 /* 2021 * the fd is stored in the aio_req_t by aio_req_setup(), and 2022 * is released by the aio_cleanup_thread() when the IO has 2023 * completed. 2024 */ 2025 if (error) { 2026 releasef(fdes); 2027 mutex_enter(&aiop->aio_mutex); 2028 aio_req_free(aiop, reqp); 2029 aiop->aio_pending--; 2030 if (aiop->aio_flags & AIO_REQ_BLOCK) 2031 cv_signal(&aiop->aio_cleanupcv); 2032 mutex_exit(&aiop->aio_mutex); 2033 return (error); 2034 } 2035 clear_active_fd(fdes); 2036 return (0); 2037 } 2038 2039 /* 2040 * posix version of asynchronous read and write 2041 */ 2042 static int 2043 aiorw( 2044 int opcode, 2045 void *aiocb_arg, 2046 int mode, 2047 int run_mode) 2048 { 2049 #ifdef _SYSCALL32_IMPL 2050 aiocb32_t aiocb32; 2051 struct sigevent32 *sigev32; 2052 port_notify32_t pntfy32; 2053 #endif 2054 aiocb64_32_t aiocb64; 2055 aiocb_t aiocb; 2056 file_t *fp; 2057 int error, fd; 2058 size_t bufsize; 2059 struct vnode *vp; 2060 aio_req_t *reqp; 2061 aio_t *aiop; 2062 int (*aio_func)(); 2063 aio_result_t *resultp; 2064 struct sigevent *sigev; 2065 model_t model; 2066 int aio_use_port = 0; 2067 port_notify_t pntfy; 2068 2069 model = get_udatamodel(); 2070 aiop = curproc->p_aio; 2071 if (aiop == NULL) 2072 return (EINVAL); 2073 2074 if (model == DATAMODEL_NATIVE) { 2075 if (run_mode != AIO_LARGEFILE) { 2076 if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t))) 2077 return (EFAULT); 2078 bufsize = aiocb.aio_nbytes; 2079 resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp); 2080 if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) { 2081 return (EBADF); 2082 } 2083 sigev = &aiocb.aio_sigevent; 2084 } else { 2085 /* 2086 * We come here only when we make largefile 2087 * call on 32 bit kernel using 32 bit library. 
2088 */ 2089 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t))) 2090 return (EFAULT); 2091 bufsize = aiocb64.aio_nbytes; 2092 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg) 2093 ->aio_resultp); 2094 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) 2095 return (EBADF); 2096 sigev = (struct sigevent *)&aiocb64.aio_sigevent; 2097 } 2098 2099 if (sigev->sigev_notify == SIGEV_PORT) { 2100 if (copyin((void *)sigev->sigev_value.sival_ptr, 2101 &pntfy, sizeof (port_notify_t))) { 2102 releasef(fd); 2103 return (EFAULT); 2104 } 2105 aio_use_port = 1; 2106 } else if (sigev->sigev_notify == SIGEV_THREAD) { 2107 pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo; 2108 pntfy.portnfy_user = 2109 aiocb.aio_sigevent.sigev_value.sival_ptr; 2110 aio_use_port = 1; 2111 } 2112 } 2113 #ifdef _SYSCALL32_IMPL 2114 else { 2115 if (run_mode == AIO_32) { 2116 /* 32 bit system call is being made on 64 bit kernel */ 2117 if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t))) 2118 return (EFAULT); 2119 2120 bufsize = aiocb32.aio_nbytes; 2121 aiocb_32ton(&aiocb32, &aiocb); 2122 resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)-> 2123 aio_resultp); 2124 if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) { 2125 return (EBADF); 2126 } 2127 sigev32 = &aiocb32.aio_sigevent; 2128 } else if (run_mode == AIO_LARGEFILE) { 2129 /* 2130 * We come here only when we make largefile 2131 * call on 64 bit kernel using 32 bit library. 2132 */ 2133 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t))) 2134 return (EFAULT); 2135 bufsize = aiocb64.aio_nbytes; 2136 aiocb_LFton(&aiocb64, &aiocb); 2137 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg) 2138 ->aio_resultp); 2139 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) 2140 return (EBADF); 2141 sigev32 = &aiocb64.aio_sigevent; 2142 } 2143 2144 if (sigev32->sigev_notify == SIGEV_PORT) { 2145 if (copyin( 2146 (void *)(uintptr_t)sigev32->sigev_value.sival_ptr, 2147 &pntfy32, sizeof (port_notify32_t))) { 2148 releasef(fd); 2149 return (EFAULT); 2150 } 2151 pntfy.portnfy_port = pntfy32.portnfy_port; 2152 pntfy.portnfy_user = (void *)(uintptr_t) 2153 pntfy32.portnfy_user; 2154 aio_use_port = 1; 2155 } else if (sigev32->sigev_notify == SIGEV_THREAD) { 2156 pntfy.portnfy_port = sigev32->sigev_signo; 2157 pntfy.portnfy_user = (void *)(uintptr_t) 2158 sigev32->sigev_value.sival_ptr; 2159 aio_use_port = 1; 2160 } 2161 } 2162 #endif /* _SYSCALL32_IMPL */ 2163 2164 /* 2165 * check the permission of the partition 2166 */ 2167 2168 if ((fp->f_flag & mode) == 0) { 2169 releasef(fd); 2170 return (EBADF); 2171 } 2172 2173 vp = fp->f_vnode; 2174 aio_func = check_vp(vp, mode); 2175 if (aio_func == NULL) { 2176 releasef(fd); 2177 return (EBADFD); 2178 } 2179 if (run_mode == AIO_LARGEFILE) 2180 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp); 2181 else 2182 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp); 2183 2184 if (error) { 2185 releasef(fd); 2186 return (error); 2187 } 2188 /* 2189 * enable polling on this request if the opcode has 2190 * the AIO poll bit set 2191 */ 2192 if (opcode & AIO_POLL_BIT) 2193 reqp->aio_req_flags |= AIO_POLL; 2194 2195 if (model == DATAMODEL_NATIVE) 2196 reqp->aio_req_iocb.iocb = aiocb_arg; 2197 #ifdef _SYSCALL32_IMPL 2198 else 2199 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg; 2200 #endif 2201 2202 if (aio_use_port) { 2203 int event = (run_mode == AIO_LARGEFILE)? 2204 ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) : 2205 ((mode == FREAD)? 
AIOAREAD : AIOAWRITE); 2206 error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event); 2207 } 2208 2209 /* 2210 * send the request to driver. 2211 */ 2212 if (error == 0) { 2213 if (bufsize == 0) { 2214 clear_active_fd(fd); 2215 aio_zerolen(reqp); 2216 return (0); 2217 } 2218 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED()); 2219 } 2220 2221 /* 2222 * the fd is stored in the aio_req_t by aio_req_setup(), and 2223 * is released by the aio_cleanup_thread() when the IO has 2224 * completed. 2225 */ 2226 if (error) { 2227 releasef(fd); 2228 mutex_enter(&aiop->aio_mutex); 2229 aio_deq(&aiop->aio_portpending, reqp); 2230 aio_req_free(aiop, reqp); 2231 aiop->aio_pending--; 2232 if (aiop->aio_flags & AIO_REQ_BLOCK) 2233 cv_signal(&aiop->aio_cleanupcv); 2234 mutex_exit(&aiop->aio_mutex); 2235 return (error); 2236 } 2237 clear_active_fd(fd); 2238 return (0); 2239 } 2240 2241 2242 /* 2243 * set error for a list IO entry that failed. 2244 */ 2245 static void 2246 lio_set_error(aio_req_t *reqp) 2247 { 2248 aio_t *aiop = curproc->p_aio; 2249 2250 if (aiop == NULL) 2251 return; 2252 2253 mutex_enter(&aiop->aio_mutex); 2254 aio_deq(&aiop->aio_portpending, reqp); 2255 aiop->aio_pending--; 2256 /* request failed, AIO_PHYSIODONE set to aviod physio cleanup. */ 2257 reqp->aio_req_flags |= AIO_PHYSIODONE; 2258 /* 2259 * Need to free the request now as its never 2260 * going to get on the done queue 2261 * 2262 * Note: aio_outstanding is decremented in 2263 * aio_req_free() 2264 */ 2265 aio_req_free(aiop, reqp); 2266 if (aiop->aio_flags & AIO_REQ_BLOCK) 2267 cv_signal(&aiop->aio_cleanupcv); 2268 mutex_exit(&aiop->aio_mutex); 2269 } 2270 2271 /* 2272 * check if a specified request is done, and remove it from 2273 * the done queue. otherwise remove anybody from the done queue 2274 * if NULL is specified. 2275 */ 2276 static aio_req_t * 2277 aio_req_done(void *resultp) 2278 { 2279 aio_req_t **bucket; 2280 aio_req_t *ent; 2281 aio_t *aiop = curproc->p_aio; 2282 long index; 2283 2284 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex)); 2285 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2286 2287 if (resultp) { 2288 index = AIO_HASH(resultp); 2289 bucket = &aiop->aio_hash[index]; 2290 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 2291 if (ent->aio_req_resultp == (aio_result_t *)resultp) { 2292 if (ent->aio_req_flags & AIO_DONEQ) { 2293 return (aio_req_remove(ent)); 2294 } 2295 return (NULL); 2296 } 2297 } 2298 /* no match, resultp is invalid */ 2299 return (NULL); 2300 } 2301 return (aio_req_remove(NULL)); 2302 } 2303 2304 /* 2305 * determine if a user-level resultp pointer is associated with an 2306 * active IO request. Zero is returned when the request is done, 2307 * and the request is removed from the done queue. Only when the 2308 * return value is zero, is the "reqp" pointer valid. One is returned 2309 * when the request is inprogress. Two is returned when the request 2310 * is invalid. 
2311 */ 2312 static int 2313 aio_req_find(aio_result_t *resultp, aio_req_t **reqp) 2314 { 2315 aio_req_t **bucket; 2316 aio_req_t *ent; 2317 aio_t *aiop = curproc->p_aio; 2318 long index; 2319 2320 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex)); 2321 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2322 2323 index = AIO_HASH(resultp); 2324 bucket = &aiop->aio_hash[index]; 2325 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 2326 if (ent->aio_req_resultp == resultp) { 2327 if (ent->aio_req_flags & AIO_DONEQ) { 2328 *reqp = aio_req_remove(ent); 2329 return (0); 2330 } 2331 return (1); 2332 } 2333 } 2334 /* no match, resultp is invalid */ 2335 return (2); 2336 } 2337 2338 /* 2339 * remove a request from the done queue. 2340 */ 2341 static aio_req_t * 2342 aio_req_remove(aio_req_t *reqp) 2343 { 2344 aio_t *aiop = curproc->p_aio; 2345 2346 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2347 2348 if (reqp != NULL) { 2349 ASSERT(reqp->aio_req_flags & AIO_DONEQ); 2350 if (reqp->aio_req_next == reqp) { 2351 /* only one request on queue */ 2352 if (reqp == aiop->aio_doneq) { 2353 aiop->aio_doneq = NULL; 2354 } else { 2355 ASSERT(reqp == aiop->aio_cleanupq); 2356 aiop->aio_cleanupq = NULL; 2357 } 2358 } else { 2359 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2360 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2361 /* 2362 * The request can be either on the aio_doneq or the 2363 * aio_cleanupq 2364 */ 2365 if (reqp == aiop->aio_doneq) 2366 aiop->aio_doneq = reqp->aio_req_next; 2367 2368 if (reqp == aiop->aio_cleanupq) 2369 aiop->aio_cleanupq = reqp->aio_req_next; 2370 } 2371 reqp->aio_req_flags &= ~AIO_DONEQ; 2372 reqp->aio_req_next = NULL; 2373 reqp->aio_req_prev = NULL; 2374 } else if ((reqp = aiop->aio_doneq) != NULL) { 2375 ASSERT(reqp->aio_req_flags & AIO_DONEQ); 2376 if (reqp == reqp->aio_req_next) { 2377 /* only one request on queue */ 2378 aiop->aio_doneq = NULL; 2379 } else { 2380 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2381 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2382 aiop->aio_doneq = reqp->aio_req_next; 2383 } 2384 reqp->aio_req_flags &= ~AIO_DONEQ; 2385 reqp->aio_req_next = NULL; 2386 reqp->aio_req_prev = NULL; 2387 } 2388 if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN)) 2389 cv_broadcast(&aiop->aio_waitcv); 2390 return (reqp); 2391 } 2392 2393 static int 2394 aio_req_setup( 2395 aio_req_t **reqpp, 2396 aio_t *aiop, 2397 aiocb_t *arg, 2398 aio_result_t *resultp, 2399 vnode_t *vp) 2400 { 2401 sigqueue_t *sqp = NULL; 2402 aio_req_t *reqp; 2403 struct uio *uio; 2404 struct sigevent *sigev; 2405 int error; 2406 2407 sigev = &arg->aio_sigevent; 2408 if (sigev->sigev_notify == SIGEV_SIGNAL && 2409 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) { 2410 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 2411 if (sqp == NULL) 2412 return (EAGAIN); 2413 sqp->sq_func = NULL; 2414 sqp->sq_next = NULL; 2415 sqp->sq_info.si_code = SI_ASYNCIO; 2416 sqp->sq_info.si_pid = curproc->p_pid; 2417 sqp->sq_info.si_ctid = PRCTID(curproc); 2418 sqp->sq_info.si_zoneid = getzoneid(); 2419 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 2420 sqp->sq_info.si_signo = sigev->sigev_signo; 2421 sqp->sq_info.si_value = sigev->sigev_value; 2422 } 2423 2424 mutex_enter(&aiop->aio_mutex); 2425 2426 if (aiop->aio_flags & AIO_REQ_BLOCK) { 2427 mutex_exit(&aiop->aio_mutex); 2428 if (sqp) 2429 kmem_free(sqp, sizeof (sigqueue_t)); 2430 return (EIO); 2431 } 2432 /* 2433 * get an aio_reqp from the free list or allocate one 2434 * from dynamic memory. 
2435 */ 2436 if (error = aio_req_alloc(&reqp, resultp)) { 2437 mutex_exit(&aiop->aio_mutex); 2438 if (sqp) 2439 kmem_free(sqp, sizeof (sigqueue_t)); 2440 return (error); 2441 } 2442 aiop->aio_pending++; 2443 aiop->aio_outstanding++; 2444 reqp->aio_req_flags = AIO_PENDING; 2445 if (sigev->sigev_notify == SIGEV_THREAD || 2446 sigev->sigev_notify == SIGEV_PORT) 2447 aio_enq(&aiop->aio_portpending, reqp, 0); 2448 mutex_exit(&aiop->aio_mutex); 2449 /* 2450 * initialize aio request. 2451 */ 2452 reqp->aio_req_fd = arg->aio_fildes; 2453 reqp->aio_req_sigqp = sqp; 2454 reqp->aio_req_iocb.iocb = NULL; 2455 reqp->aio_req_lio = NULL; 2456 reqp->aio_req_buf.b_file = vp; 2457 uio = reqp->aio_req.aio_uio; 2458 uio->uio_iovcnt = 1; 2459 uio->uio_iov->iov_base = (caddr_t)arg->aio_buf; 2460 uio->uio_iov->iov_len = arg->aio_nbytes; 2461 uio->uio_loffset = arg->aio_offset; 2462 *reqpp = reqp; 2463 return (0); 2464 } 2465 2466 /* 2467 * Allocate p_aio struct. 2468 */ 2469 static aio_t * 2470 aio_aiop_alloc(void) 2471 { 2472 aio_t *aiop; 2473 2474 ASSERT(MUTEX_HELD(&curproc->p_lock)); 2475 2476 aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP); 2477 if (aiop) { 2478 mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL); 2479 mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT, 2480 NULL); 2481 mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL); 2482 } 2483 return (aiop); 2484 } 2485 2486 /* 2487 * Allocate an aio_req struct. 2488 */ 2489 static int 2490 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp) 2491 { 2492 aio_req_t *reqp; 2493 aio_t *aiop = curproc->p_aio; 2494 2495 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2496 2497 if ((reqp = aiop->aio_free) != NULL) { 2498 aiop->aio_free = reqp->aio_req_next; 2499 bzero(reqp, sizeof (*reqp)); 2500 } else { 2501 /* 2502 * Check whether memory is getting tight. 2503 * This is a temporary mechanism to avoid memory 2504 * exhaustion by a single process until we come up 2505 * with a per process solution such as setrlimit(). 2506 */ 2507 if (freemem < desfree) 2508 return (EAGAIN); 2509 reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP); 2510 if (reqp == NULL) 2511 return (EAGAIN); 2512 } 2513 reqp->aio_req.aio_uio = &reqp->aio_req_uio; 2514 reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov; 2515 reqp->aio_req.aio_private = reqp; 2516 reqp->aio_req_buf.b_offset = -1; 2517 reqp->aio_req_resultp = resultp; 2518 if (aio_hash_insert(reqp, aiop)) { 2519 reqp->aio_req_next = aiop->aio_free; 2520 aiop->aio_free = reqp; 2521 return (EINVAL); 2522 } 2523 *nreqp = reqp; 2524 return (0); 2525 } 2526 2527 /* 2528 * Allocate an aio_lio_t struct. 2529 */ 2530 static int 2531 aio_lio_alloc(aio_lio_t **head) 2532 { 2533 aio_lio_t *liop; 2534 aio_t *aiop = curproc->p_aio; 2535 2536 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2537 2538 if ((liop = aiop->aio_lio_free) != NULL) { 2539 aiop->aio_lio_free = liop->lio_next; 2540 } else { 2541 /* 2542 * Check whether memory is getting tight. 2543 * This is a temporary mechanism to avoid memory 2544 * exhaustion by a single process until we come up 2545 * with a per process solution such as setrlimit(). 2546 */ 2547 if (freemem < desfree) 2548 return (EAGAIN); 2549 2550 liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP); 2551 if (liop == NULL) 2552 return (EAGAIN); 2553 } 2554 *head = liop; 2555 return (0); 2556 } 2557 2558 /* 2559 * this is a special per-process thread that is only activated if 2560 * the process is unmapping a segment with outstanding aio. 
normally, 2561 * the process will have completed the aio before unmapping the 2562 * segment. If the process does unmap a segment with outstanding aio, 2563 * this special thread will guarantee that the locked pages due to 2564 * aphysio() are released, thereby permitting the segment to be 2565 * unmapped. In addition to this, the cleanup thread is woken up 2566 * during DR operations to release the locked pages. 2567 */ 2568 2569 static int 2570 aio_cleanup_thread(aio_t *aiop) 2571 { 2572 proc_t *p = curproc; 2573 struct as *as = p->p_as; 2574 int poked = 0; 2575 kcondvar_t *cvp; 2576 int exit_flag = 0; 2577 int rqclnup = 0; 2578 2579 sigfillset(&curthread->t_hold); 2580 sigdiffset(&curthread->t_hold, &cantmask); 2581 for (;;) { 2582 /* 2583 * if a segment is being unmapped, and the current 2584 * process's done queue is not empty, then every request 2585 * on the doneq with locked resources should be forced 2586 * to release its locks. By moving the doneq request 2587 * to the cleanupq, aio_cleanup() will process the cleanupq, 2588 * and place requests back onto the doneq. All requests 2589 * processed by aio_cleanup() will have their physical 2590 * resources unlocked. 2591 */ 2592 mutex_enter(&aiop->aio_mutex); 2593 if ((aiop->aio_flags & AIO_CLEANUP) == 0) { 2594 aiop->aio_flags |= AIO_CLEANUP; 2595 mutex_enter(&as->a_contents); 2596 if (aiop->aio_rqclnup) { 2597 aiop->aio_rqclnup = 0; 2598 rqclnup = 1; 2599 } 2600 2601 if ((rqclnup || AS_ISUNMAPWAIT(as)) && 2602 aiop->aio_doneq) { 2603 aio_req_t *doneqhead = aiop->aio_doneq; 2604 mutex_exit(&as->a_contents); 2605 aiop->aio_doneq = NULL; 2606 aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ); 2607 } else { 2608 mutex_exit(&as->a_contents); 2609 } 2610 } 2611 mutex_exit(&aiop->aio_mutex); 2612 aio_cleanup(AIO_CLEANUP_THREAD); 2613 /* 2614 * thread should block on the cleanupcv while 2615 * AIO_CLEANUP is set. 2616 */ 2617 cvp = &aiop->aio_cleanupcv; 2618 mutex_enter(&aiop->aio_mutex); 2619 2620 if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL || 2621 aiop->aio_notifyq != NULL || 2622 aiop->aio_portcleanupq != NULL) { 2623 mutex_exit(&aiop->aio_mutex); 2624 continue; 2625 } 2626 mutex_enter(&as->a_contents); 2627 2628 /* 2629 * AIO_CLEANUP determines when the cleanup thread 2630 * should be active. This flag is set when 2631 * the cleanup thread is awakened by as_unmap() or 2632 * due to DR operations. 2633 * The flag is cleared when the blocking as_unmap() 2634 * that originally awakened us is allowed to 2635 * complete. as_unmap() blocks when trying to 2636 * unmap a segment that has SOFTLOCKed pages. When 2637 * the segment's pages are all SOFTUNLOCKed, 2638 * as->a_flags & AS_UNMAPWAIT should be zero. 2639 * 2640 * In case of cleanup request by DR, the flag is cleared 2641 * once all the pending aio requests have been processed. 2642 * 2643 * The flag shouldn't be cleared right away if the 2644 * cleanup thread was interrupted because the process 2645 * is doing forkall(). This happens when cv_wait_sig() 2646 * returns zero, because it was awakened by a pokelwps(). 2647 * If the process is not exiting, it must be doing forkall().
2648 */ 2649 if ((poked == 0) && 2650 ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) || 2651 (aiop->aio_pending == 0))) { 2652 aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT); 2653 cvp = &as->a_cv; 2654 rqclnup = 0; 2655 } 2656 mutex_exit(&aiop->aio_mutex); 2657 if (poked) { 2658 /* 2659 * If the process is exiting/killed, don't return 2660 * immediately without waiting for pending I/O's 2661 * and releasing the page locks. 2662 */ 2663 if (p->p_flag & (SEXITLWPS|SKILLED)) { 2664 /* 2665 * If exit_flag is set, then it is 2666 * safe to exit because we have released 2667 * page locks of completed I/O's. 2668 */ 2669 if (exit_flag) 2670 break; 2671 2672 mutex_exit(&as->a_contents); 2673 2674 /* 2675 * Wait for all the pending aio to complete. 2676 */ 2677 mutex_enter(&aiop->aio_mutex); 2678 aiop->aio_flags |= AIO_REQ_BLOCK; 2679 while (aiop->aio_pending != 0) 2680 cv_wait(&aiop->aio_cleanupcv, 2681 &aiop->aio_mutex); 2682 mutex_exit(&aiop->aio_mutex); 2683 exit_flag = 1; 2684 continue; 2685 } else if (p->p_flag & 2686 (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) { 2687 /* 2688 * hold LWP until it 2689 * is continued. 2690 */ 2691 mutex_exit(&as->a_contents); 2692 mutex_enter(&p->p_lock); 2693 stop(PR_SUSPENDED, SUSPEND_NORMAL); 2694 mutex_exit(&p->p_lock); 2695 poked = 0; 2696 continue; 2697 } 2698 } else { 2699 /* 2700 * When started this thread will sleep on as->a_cv. 2701 * as_unmap will awake this thread if the 2702 * segment has SOFTLOCKed pages (poked = 0). 2703 * 1. pokelwps() awakes this thread => 2704 * break the loop to check SEXITLWPS, SHOLDFORK, etc 2705 * 2. as_unmap awakes this thread => 2706 * to break the loop it is necessary that 2707 * - AS_UNMAPWAIT is set (as_unmap is waiting for 2708 * memory to be unlocked) 2709 * - AIO_CLEANUP is not set 2710 * (if AIO_CLEANUP is set we have to wait for 2711 * pending requests. aio_done will send a signal 2712 * for every request which completes to continue 2713 * unmapping the corresponding address range) 2714 * 3. A cleanup request will wake this thread up, ex. 2715 * by the DR operations. The aio_rqclnup flag will 2716 * be set. 2717 */ 2718 while (poked == 0) { 2719 /* 2720 * we need to handle cleanup requests 2721 * that come in after we had just cleaned up, 2722 * so that we do cleanup of any new aio 2723 * requests that got completed and have 2724 * locked resources. 2725 */ 2726 if ((aiop->aio_rqclnup || 2727 (AS_ISUNMAPWAIT(as) != 0)) && 2728 (aiop->aio_flags & AIO_CLEANUP) == 0) 2729 break; 2730 poked = !cv_wait_sig(cvp, &as->a_contents); 2731 if (AS_ISUNMAPWAIT(as) == 0) 2732 cv_signal(cvp); 2733 if (aiop->aio_outstanding != 0) 2734 break; 2735 } 2736 } 2737 mutex_exit(&as->a_contents); 2738 } 2739 exit: 2740 mutex_exit(&as->a_contents); 2741 ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED))); 2742 aston(curthread); /* make thread do post_syscall */ 2743 return (0); 2744 } 2745 2746 /* 2747 * save a reference to a user's outstanding aio in a hash list. 
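 *
 * A minimal sketch of the assumed layout (AIO_HASH() and the bucket count
 * are defined elsewhere; the hash computation shown here is only an
 * illustration, not this file's definition):
 *
 *	index = AIO_HASH(resultp);	-- derived from the user address,
 *					-- e.g. something like
 *					-- ((uintptr_t)resultp >> shift) % nbuckets
 *	aiop->aio_hash[index] --> reqA --> reqB --> NULL
 *					-- chained via aio_hash_next and keyed
 *					-- by aio_req_resultp; inserting a
 *					-- duplicate resultp returns DUPLICATE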
2748 */ 2749 static int 2750 aio_hash_insert( 2751 aio_req_t *aio_reqp, 2752 aio_t *aiop) 2753 { 2754 long index; 2755 aio_result_t *resultp = aio_reqp->aio_req_resultp; 2756 aio_req_t *current; 2757 aio_req_t **nextp; 2758 2759 index = AIO_HASH(resultp); 2760 nextp = &aiop->aio_hash[index]; 2761 while ((current = *nextp) != NULL) { 2762 if (current->aio_req_resultp == resultp) 2763 return (DUPLICATE); 2764 nextp = &current->aio_hash_next; 2765 } 2766 *nextp = aio_reqp; 2767 aio_reqp->aio_hash_next = NULL; 2768 return (0); 2769 } 2770 2771 static int 2772 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *, 2773 cred_t *) 2774 { 2775 struct snode *sp; 2776 dev_t dev; 2777 struct cb_ops *cb; 2778 major_t major; 2779 int (*aio_func)(); 2780 2781 dev = vp->v_rdev; 2782 major = getmajor(dev); 2783 2784 /* 2785 * return NULL for requests to files and STREAMs so 2786 * that libaio takes care of them. 2787 */ 2788 if (vp->v_type == VCHR) { 2789 /* no stream device for kaio */ 2790 if (STREAMSTAB(major)) { 2791 return (NULL); 2792 } 2793 } else { 2794 return (NULL); 2795 } 2796 2797 /* 2798 * Check old drivers which do not have async I/O entry points. 2799 */ 2800 if (devopsp[major]->devo_rev < 3) 2801 return (NULL); 2802 2803 cb = devopsp[major]->devo_cb_ops; 2804 2805 if (cb->cb_rev < 1) 2806 return (NULL); 2807 2808 /* 2809 * Check whether this device is a block device. 2810 * Kaio is not supported for devices like tty. 2811 */ 2812 if (cb->cb_strategy == nodev || cb->cb_strategy == NULL) 2813 return (NULL); 2814 2815 /* 2816 * Clustering: If vnode is a PXFS vnode, then the device may be remote. 2817 * We cannot call the driver directly. Instead return the 2818 * PXFS functions. 2819 */ 2820 2821 if (IS_PXFSVP(vp)) { 2822 if (mode & FREAD) 2823 return (clpxfs_aio_read); 2824 else 2825 return (clpxfs_aio_write); 2826 } 2827 if (mode & FREAD) 2828 aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read; 2829 else 2830 aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write; 2831 2832 /* 2833 * Do we need this ? 2834 * nodev returns ENXIO anyway. 2835 */ 2836 if (aio_func == nodev) 2837 return (NULL); 2838 2839 sp = VTOS(vp); 2840 smark(sp, SACC); 2841 return (aio_func); 2842 } 2843 2844 /* 2845 * Clustering: We want check_vp to return a function prototyped 2846 * correctly that will be common to both PXFS and regular case. 2847 * We define this intermediate function that will do the right 2848 * thing for driver cases. 2849 */ 2850 2851 static int 2852 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p) 2853 { 2854 dev_t dev; 2855 struct cb_ops *cb; 2856 2857 ASSERT(vp->v_type == VCHR); 2858 ASSERT(!IS_PXFSVP(vp)); 2859 dev = VTOS(vp)->s_dev; 2860 ASSERT(STREAMSTAB(getmajor(dev)) == NULL); 2861 2862 cb = devopsp[getmajor(dev)]->devo_cb_ops; 2863 2864 ASSERT(cb->cb_awrite != nodev); 2865 return ((*cb->cb_awrite)(dev, aio, cred_p)); 2866 } 2867 2868 /* 2869 * Clustering: We want check_vp to return a function prototyped 2870 * correctly that will be common to both PXFS and regular case. 2871 * We define this intermediate function that will do the right 2872 * thing for driver cases.
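 *
 * For context, a hedged sketch (hypothetical driver "xx", not part of
 * this file) of the cb_ops plumbing that check_vp() looks for above.  A
 * character driver that leaves cb_aread/cb_awrite as nodev, or that
 * reports cb_rev < 1, is rejected and the request falls back to
 * user-level aio:
 *
 *	static int
 *	xxaread(dev_t dev, struct aio_req *aio, cred_t *cr)
 *	{
 *		return (aphysio(xxstrategy, anocancel, dev, B_READ,
 *		    minphys, aio));
 *	}
 *
 *	static struct cb_ops xx_cb_ops = {
 *		...				-- other entry points, cb_flag, etc.
 *		CB_REV,				-- cb_rev >= 1
 *		xxaread,			-- cb_aread
 *		xxawrite			-- cb_awrite
 *	};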
2873 */ 2874 2875 static int 2876 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p) 2877 { 2878 dev_t dev; 2879 struct cb_ops *cb; 2880 2881 ASSERT(vp->v_type == VCHR); 2882 ASSERT(!IS_PXFSVP(vp)); 2883 dev = VTOS(vp)->s_dev; 2884 ASSERT(!STREAMSTAB(getmajor(dev))); 2885 2886 cb = devopsp[getmajor(dev)]->devo_cb_ops; 2887 2888 ASSERT(cb->cb_aread != nodev); 2889 return ((*cb->cb_aread)(dev, aio, cred_p)); 2890 } 2891 2892 /* 2893 * This routine is called when a largefile call is made by a 32bit 2894 * process on a ILP32 or LP64 kernel. All 64bit processes are large 2895 * file by definition and will call alio() instead. 2896 */ 2897 static int 2898 alioLF( 2899 int mode_arg, 2900 void *aiocb_arg, 2901 int nent, 2902 void *sigev) 2903 { 2904 file_t *fp; 2905 file_t *prev_fp = NULL; 2906 int prev_mode = -1; 2907 struct vnode *vp; 2908 aio_lio_t *head; 2909 aio_req_t *reqp; 2910 aio_t *aiop; 2911 caddr_t cbplist; 2912 aiocb64_32_t cb64; 2913 aiocb64_32_t *aiocb = &cb64; 2914 aiocb64_32_t *cbp; 2915 caddr32_t *ucbp; 2916 #ifdef _LP64 2917 aiocb_t aiocb_n; 2918 #endif 2919 struct sigevent32 sigevk; 2920 sigqueue_t *sqp; 2921 int (*aio_func)(); 2922 int mode; 2923 int error = 0; 2924 int aio_errors = 0; 2925 int i; 2926 size_t ssize; 2927 int deadhead = 0; 2928 int aio_notsupported = 0; 2929 int lio_head_port; 2930 int aio_port; 2931 int aio_thread; 2932 port_kevent_t *pkevtp = NULL; 2933 port_notify32_t pnotify; 2934 int event; 2935 2936 aiop = curproc->p_aio; 2937 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 2938 return (EINVAL); 2939 2940 ASSERT(get_udatamodel() == DATAMODEL_ILP32); 2941 2942 ssize = (sizeof (caddr32_t) * nent); 2943 cbplist = kmem_alloc(ssize, KM_SLEEP); 2944 ucbp = (caddr32_t *)cbplist; 2945 2946 if (copyin(aiocb_arg, cbplist, ssize) || 2947 (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) { 2948 kmem_free(cbplist, ssize); 2949 return (EFAULT); 2950 } 2951 2952 /* Event Ports */ 2953 if (sigev && 2954 (sigevk.sigev_notify == SIGEV_THREAD || 2955 sigevk.sigev_notify == SIGEV_PORT)) { 2956 if (sigevk.sigev_notify == SIGEV_THREAD) { 2957 pnotify.portnfy_port = sigevk.sigev_signo; 2958 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr; 2959 } else if (copyin( 2960 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr, 2961 &pnotify, sizeof (pnotify))) { 2962 kmem_free(cbplist, ssize); 2963 return (EFAULT); 2964 } 2965 error = port_alloc_event(pnotify.portnfy_port, 2966 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp); 2967 if (error) { 2968 if (error == ENOMEM || error == EAGAIN) 2969 error = EAGAIN; 2970 else 2971 error = EINVAL; 2972 kmem_free(cbplist, ssize); 2973 return (error); 2974 } 2975 lio_head_port = pnotify.portnfy_port; 2976 } 2977 2978 /* 2979 * a list head should be allocated if notification is 2980 * enabled for this list. 
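 *
 * For reference, a hedged userland-side sketch of the SIGEV_PORT
 * convention the copyin code above relies on (variable names are
 * illustrative only): the application points sigev_value.sival_ptr at a
 * port_notify_t, whose portnfy_user cookie comes back in the port event.
 *
 *	port_notify_t pn;
 *	struct sigevent sev;
 *
 *	pn.portnfy_port = port;			-- from port_create()
 *	pn.portnfy_user = my_cookie;
 *	sev.sigev_notify = SIGEV_PORT;
 *	sev.sigev_value.sival_ptr = &pn;
 *	(void) lio_listio(LIO_NOWAIT, list, nent, &sev);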
2981 */ 2982 head = NULL; 2983 2984 if (mode_arg == LIO_WAIT || sigev) { 2985 mutex_enter(&aiop->aio_mutex); 2986 error = aio_lio_alloc(&head); 2987 mutex_exit(&aiop->aio_mutex); 2988 if (error) 2989 goto done; 2990 deadhead = 1; 2991 head->lio_nent = nent; 2992 head->lio_refcnt = nent; 2993 head->lio_port = -1; 2994 head->lio_portkev = NULL; 2995 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL && 2996 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) { 2997 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 2998 if (sqp == NULL) { 2999 error = EAGAIN; 3000 goto done; 3001 } 3002 sqp->sq_func = NULL; 3003 sqp->sq_next = NULL; 3004 sqp->sq_info.si_code = SI_ASYNCIO; 3005 sqp->sq_info.si_pid = curproc->p_pid; 3006 sqp->sq_info.si_ctid = PRCTID(curproc); 3007 sqp->sq_info.si_zoneid = getzoneid(); 3008 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3009 sqp->sq_info.si_signo = sigevk.sigev_signo; 3010 sqp->sq_info.si_value.sival_int = 3011 sigevk.sigev_value.sival_int; 3012 head->lio_sigqp = sqp; 3013 } else { 3014 head->lio_sigqp = NULL; 3015 } 3016 if (pkevtp) { 3017 /* 3018 * Prepare data to send when list of aiocb's 3019 * has completed. 3020 */ 3021 port_init_event(pkevtp, (uintptr_t)sigev, 3022 (void *)(uintptr_t)pnotify.portnfy_user, 3023 NULL, head); 3024 pkevtp->portkev_events = AIOLIO64; 3025 head->lio_portkev = pkevtp; 3026 head->lio_port = pnotify.portnfy_port; 3027 } 3028 } 3029 3030 for (i = 0; i < nent; i++, ucbp++) { 3031 3032 cbp = (aiocb64_32_t *)(uintptr_t)*ucbp; 3033 /* skip entry if it can't be copied. */ 3034 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) { 3035 if (head) { 3036 mutex_enter(&aiop->aio_mutex); 3037 head->lio_nent--; 3038 head->lio_refcnt--; 3039 mutex_exit(&aiop->aio_mutex); 3040 } 3041 continue; 3042 } 3043 3044 /* skip if opcode for aiocb is LIO_NOP */ 3045 mode = aiocb->aio_lio_opcode; 3046 if (mode == LIO_NOP) { 3047 cbp = NULL; 3048 if (head) { 3049 mutex_enter(&aiop->aio_mutex); 3050 head->lio_nent--; 3051 head->lio_refcnt--; 3052 mutex_exit(&aiop->aio_mutex); 3053 } 3054 continue; 3055 } 3056 3057 /* increment file descriptor's ref count. 
*/ 3058 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 3059 lio_set_uerror(&cbp->aio_resultp, EBADF); 3060 if (head) { 3061 mutex_enter(&aiop->aio_mutex); 3062 head->lio_nent--; 3063 head->lio_refcnt--; 3064 mutex_exit(&aiop->aio_mutex); 3065 } 3066 aio_errors++; 3067 continue; 3068 } 3069 3070 /* 3071 * check the permission of the partition 3072 */ 3073 if ((fp->f_flag & mode) == 0) { 3074 releasef(aiocb->aio_fildes); 3075 lio_set_uerror(&cbp->aio_resultp, EBADF); 3076 if (head) { 3077 mutex_enter(&aiop->aio_mutex); 3078 head->lio_nent--; 3079 head->lio_refcnt--; 3080 mutex_exit(&aiop->aio_mutex); 3081 } 3082 aio_errors++; 3083 continue; 3084 } 3085 3086 /* 3087 * common case where requests are to the same fd 3088 * for the same r/w operation 3089 * for UFS, need to set EBADFD 3090 */ 3091 vp = fp->f_vnode; 3092 if (fp != prev_fp || mode != prev_mode) { 3093 aio_func = check_vp(vp, mode); 3094 if (aio_func == NULL) { 3095 prev_fp = NULL; 3096 releasef(aiocb->aio_fildes); 3097 lio_set_uerror(&cbp->aio_resultp, EBADFD); 3098 aio_notsupported++; 3099 if (head) { 3100 mutex_enter(&aiop->aio_mutex); 3101 head->lio_nent--; 3102 head->lio_refcnt--; 3103 mutex_exit(&aiop->aio_mutex); 3104 } 3105 continue; 3106 } else { 3107 prev_fp = fp; 3108 prev_mode = mode; 3109 } 3110 } 3111 3112 #ifdef _LP64 3113 aiocb_LFton(aiocb, &aiocb_n); 3114 error = aio_req_setup(&reqp, aiop, &aiocb_n, 3115 (aio_result_t *)&cbp->aio_resultp, vp); 3116 #else 3117 error = aio_req_setupLF(&reqp, aiop, aiocb, 3118 (aio_result_t *)&cbp->aio_resultp, vp); 3119 #endif /* _LP64 */ 3120 if (error) { 3121 releasef(aiocb->aio_fildes); 3122 lio_set_uerror(&cbp->aio_resultp, error); 3123 if (head) { 3124 mutex_enter(&aiop->aio_mutex); 3125 head->lio_nent--; 3126 head->lio_refcnt--; 3127 mutex_exit(&aiop->aio_mutex); 3128 } 3129 aio_errors++; 3130 continue; 3131 } 3132 3133 reqp->aio_req_lio = head; 3134 deadhead = 0; 3135 3136 /* 3137 * Set the errno field now before sending the request to 3138 * the driver to avoid a race condition 3139 */ 3140 (void) suword32(&cbp->aio_resultp.aio_errno, 3141 EINPROGRESS); 3142 3143 reqp->aio_req_iocb.iocb32 = *ucbp; 3144 3145 event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64; 3146 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT); 3147 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD); 3148 if (aio_port | aio_thread) { 3149 port_kevent_t *lpkevp; 3150 /* 3151 * Prepare data to send with each aiocb completed. 3152 */ 3153 if (aio_port) { 3154 void *paddr = (void *)(uintptr_t) 3155 aiocb->aio_sigevent.sigev_value.sival_ptr; 3156 if (copyin(paddr, &pnotify, sizeof (pnotify))) 3157 error = EFAULT; 3158 } else { /* aio_thread */ 3159 pnotify.portnfy_port = 3160 aiocb->aio_sigevent.sigev_signo; 3161 pnotify.portnfy_user = 3162 aiocb->aio_sigevent.sigev_value.sival_ptr; 3163 } 3164 if (error) 3165 /* EMPTY */; 3166 else if (pkevtp != NULL && 3167 pnotify.portnfy_port == lio_head_port) 3168 error = port_dup_event(pkevtp, &lpkevp, 3169 PORT_ALLOC_DEFAULT); 3170 else 3171 error = port_alloc_event(pnotify.portnfy_port, 3172 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, 3173 &lpkevp); 3174 if (error == 0) { 3175 port_init_event(lpkevp, (uintptr_t)*ucbp, 3176 (void *)(uintptr_t)pnotify.portnfy_user, 3177 aio_port_callback, reqp); 3178 lpkevp->portkev_events = event; 3179 reqp->aio_req_portkev = lpkevp; 3180 reqp->aio_req_port = pnotify.portnfy_port; 3181 } 3182 } 3183 3184 /* 3185 * send the request to driver. 
3186 */ 3187 if (error == 0) { 3188 if (aiocb->aio_nbytes == 0) { 3189 clear_active_fd(aiocb->aio_fildes); 3190 aio_zerolen(reqp); 3191 continue; 3192 } 3193 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 3194 CRED()); 3195 } 3196 3197 /* 3198 * the fd's ref count is not decremented until the IO has 3199 * completed unless there was an error. 3200 */ 3201 if (error) { 3202 releasef(aiocb->aio_fildes); 3203 lio_set_uerror(&cbp->aio_resultp, error); 3204 if (head) { 3205 mutex_enter(&aiop->aio_mutex); 3206 head->lio_nent--; 3207 head->lio_refcnt--; 3208 mutex_exit(&aiop->aio_mutex); 3209 } 3210 if (error == ENOTSUP) 3211 aio_notsupported++; 3212 else 3213 aio_errors++; 3214 lio_set_error(reqp); 3215 } else { 3216 clear_active_fd(aiocb->aio_fildes); 3217 } 3218 } 3219 3220 if (aio_notsupported) { 3221 error = ENOTSUP; 3222 } else if (aio_errors) { 3223 /* 3224 * return EIO if any request failed 3225 */ 3226 error = EIO; 3227 } 3228 3229 if (mode_arg == LIO_WAIT) { 3230 mutex_enter(&aiop->aio_mutex); 3231 while (head->lio_refcnt > 0) { 3232 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 3233 mutex_exit(&aiop->aio_mutex); 3234 error = EINTR; 3235 goto done; 3236 } 3237 } 3238 mutex_exit(&aiop->aio_mutex); 3239 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE); 3240 } 3241 3242 done: 3243 kmem_free(cbplist, ssize); 3244 if (deadhead) { 3245 if (head->lio_sigqp) 3246 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 3247 if (head->lio_portkev) 3248 port_free_event(head->lio_portkev); 3249 kmem_free(head, sizeof (aio_lio_t)); 3250 } 3251 return (error); 3252 } 3253 3254 #ifdef _SYSCALL32_IMPL 3255 static void 3256 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest) 3257 { 3258 dest->aio_fildes = src->aio_fildes; 3259 dest->aio_buf = (void *)(uintptr_t)src->aio_buf; 3260 dest->aio_nbytes = (size_t)src->aio_nbytes; 3261 dest->aio_offset = (off_t)src->aio_offset; 3262 dest->aio_reqprio = src->aio_reqprio; 3263 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify; 3264 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo; 3265 3266 /* 3267 * See comment in sigqueue32() on handling of 32-bit 3268 * sigvals in a 64-bit kernel. 3269 */ 3270 dest->aio_sigevent.sigev_value.sival_int = 3271 (int)src->aio_sigevent.sigev_value.sival_int; 3272 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval)) 3273 (uintptr_t)src->aio_sigevent.sigev_notify_function; 3274 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *) 3275 (uintptr_t)src->aio_sigevent.sigev_notify_attributes; 3276 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2; 3277 dest->aio_lio_opcode = src->aio_lio_opcode; 3278 dest->aio_state = src->aio_state; 3279 dest->aio__pad[0] = src->aio__pad[0]; 3280 } 3281 #endif 3282 3283 /* 3284 * This function is used only for largefile calls made by 3285 * 32 bit applications. 
3286 */ 3287 static int 3288 aio_req_setupLF( 3289 aio_req_t **reqpp, 3290 aio_t *aiop, 3291 aiocb64_32_t *arg, 3292 aio_result_t *resultp, 3293 vnode_t *vp) 3294 { 3295 sigqueue_t *sqp = NULL; 3296 aio_req_t *reqp; 3297 struct uio *uio; 3298 struct sigevent32 *sigev; 3299 int error; 3300 3301 sigev = &arg->aio_sigevent; 3302 if (sigev->sigev_notify == SIGEV_SIGNAL && 3303 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) { 3304 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3305 if (sqp == NULL) 3306 return (EAGAIN); 3307 sqp->sq_func = NULL; 3308 sqp->sq_next = NULL; 3309 sqp->sq_info.si_code = SI_ASYNCIO; 3310 sqp->sq_info.si_pid = curproc->p_pid; 3311 sqp->sq_info.si_ctid = PRCTID(curproc); 3312 sqp->sq_info.si_zoneid = getzoneid(); 3313 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3314 sqp->sq_info.si_signo = sigev->sigev_signo; 3315 sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int; 3316 } 3317 3318 mutex_enter(&aiop->aio_mutex); 3319 3320 if (aiop->aio_flags & AIO_REQ_BLOCK) { 3321 mutex_exit(&aiop->aio_mutex); 3322 if (sqp) 3323 kmem_free(sqp, sizeof (sigqueue_t)); 3324 return (EIO); 3325 } 3326 /* 3327 * get an aio_reqp from the free list or allocate one 3328 * from dynamic memory. 3329 */ 3330 if (error = aio_req_alloc(&reqp, resultp)) { 3331 mutex_exit(&aiop->aio_mutex); 3332 if (sqp) 3333 kmem_free(sqp, sizeof (sigqueue_t)); 3334 return (error); 3335 } 3336 aiop->aio_pending++; 3337 aiop->aio_outstanding++; 3338 reqp->aio_req_flags = AIO_PENDING; 3339 if (sigev->sigev_notify == SIGEV_THREAD || 3340 sigev->sigev_notify == SIGEV_PORT) 3341 aio_enq(&aiop->aio_portpending, reqp, 0); 3342 mutex_exit(&aiop->aio_mutex); 3343 /* 3344 * initialize aio request. 3345 */ 3346 reqp->aio_req_fd = arg->aio_fildes; 3347 reqp->aio_req_sigqp = sqp; 3348 reqp->aio_req_iocb.iocb = NULL; 3349 reqp->aio_req_lio = NULL; 3350 reqp->aio_req_buf.b_file = vp; 3351 uio = reqp->aio_req.aio_uio; 3352 uio->uio_iovcnt = 1; 3353 uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf; 3354 uio->uio_iov->iov_len = arg->aio_nbytes; 3355 uio->uio_loffset = arg->aio_offset; 3356 *reqpp = reqp; 3357 return (0); 3358 } 3359 3360 /* 3361 * This routine is called when a non largefile call is made by a 32bit 3362 * process on a ILP32 or LP64 kernel. 
3363 */ 3364 static int 3365 alio32( 3366 int mode_arg, 3367 void *aiocb_arg, 3368 int nent, 3369 void *sigev) 3370 { 3371 file_t *fp; 3372 file_t *prev_fp = NULL; 3373 int prev_mode = -1; 3374 struct vnode *vp; 3375 aio_lio_t *head; 3376 aio_req_t *reqp; 3377 aio_t *aiop; 3378 caddr_t cbplist; 3379 aiocb_t cb; 3380 aiocb_t *aiocb = &cb; 3381 #ifdef _LP64 3382 aiocb32_t *cbp; 3383 caddr32_t *ucbp; 3384 aiocb32_t cb32; 3385 aiocb32_t *aiocb32 = &cb32; 3386 struct sigevent32 sigevk; 3387 #else 3388 aiocb_t *cbp, **ucbp; 3389 struct sigevent sigevk; 3390 #endif 3391 sigqueue_t *sqp; 3392 int (*aio_func)(); 3393 int mode; 3394 int error = 0; 3395 int aio_errors = 0; 3396 int i; 3397 size_t ssize; 3398 int deadhead = 0; 3399 int aio_notsupported = 0; 3400 int lio_head_port; 3401 int aio_port; 3402 int aio_thread; 3403 port_kevent_t *pkevtp = NULL; 3404 #ifdef _LP64 3405 port_notify32_t pnotify; 3406 #else 3407 port_notify_t pnotify; 3408 #endif 3409 int event; 3410 3411 aiop = curproc->p_aio; 3412 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 3413 return (EINVAL); 3414 3415 #ifdef _LP64 3416 ssize = (sizeof (caddr32_t) * nent); 3417 #else 3418 ssize = (sizeof (aiocb_t *) * nent); 3419 #endif 3420 cbplist = kmem_alloc(ssize, KM_SLEEP); 3421 ucbp = (void *)cbplist; 3422 3423 if (copyin(aiocb_arg, cbplist, ssize) || 3424 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) { 3425 kmem_free(cbplist, ssize); 3426 return (EFAULT); 3427 } 3428 3429 /* Event Ports */ 3430 if (sigev && 3431 (sigevk.sigev_notify == SIGEV_THREAD || 3432 sigevk.sigev_notify == SIGEV_PORT)) { 3433 if (sigevk.sigev_notify == SIGEV_THREAD) { 3434 pnotify.portnfy_port = sigevk.sigev_signo; 3435 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr; 3436 } else if (copyin( 3437 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr, 3438 &pnotify, sizeof (pnotify))) { 3439 kmem_free(cbplist, ssize); 3440 return (EFAULT); 3441 } 3442 error = port_alloc_event(pnotify.portnfy_port, 3443 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp); 3444 if (error) { 3445 if (error == ENOMEM || error == EAGAIN) 3446 error = EAGAIN; 3447 else 3448 error = EINVAL; 3449 kmem_free(cbplist, ssize); 3450 return (error); 3451 } 3452 lio_head_port = pnotify.portnfy_port; 3453 } 3454 3455 /* 3456 * a list head should be allocated if notification is 3457 * enabled for this list. 3458 */ 3459 head = NULL; 3460 3461 if (mode_arg == LIO_WAIT || sigev) { 3462 mutex_enter(&aiop->aio_mutex); 3463 error = aio_lio_alloc(&head); 3464 mutex_exit(&aiop->aio_mutex); 3465 if (error) 3466 goto done; 3467 deadhead = 1; 3468 head->lio_nent = nent; 3469 head->lio_refcnt = nent; 3470 head->lio_port = -1; 3471 head->lio_portkev = NULL; 3472 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL && 3473 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) { 3474 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3475 if (sqp == NULL) { 3476 error = EAGAIN; 3477 goto done; 3478 } 3479 sqp->sq_func = NULL; 3480 sqp->sq_next = NULL; 3481 sqp->sq_info.si_code = SI_ASYNCIO; 3482 sqp->sq_info.si_pid = curproc->p_pid; 3483 sqp->sq_info.si_ctid = PRCTID(curproc); 3484 sqp->sq_info.si_zoneid = getzoneid(); 3485 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3486 sqp->sq_info.si_signo = sigevk.sigev_signo; 3487 sqp->sq_info.si_value.sival_int = 3488 sigevk.sigev_value.sival_int; 3489 head->lio_sigqp = sqp; 3490 } else { 3491 head->lio_sigqp = NULL; 3492 } 3493 if (pkevtp) { 3494 /* 3495 * Prepare data to send when list of aiocb's has 3496 * completed. 
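 *
 * (A hedged note: given the port_init_event() call below, the
 * list-completion event is presumably delivered to the application with
 * portev_events set to AIOLIO and portev_object carrying the address of
 * the user's sigevent that was passed to lio_listio().)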
3497 */ 3498 port_init_event(pkevtp, (uintptr_t)sigev, 3499 (void *)(uintptr_t)pnotify.portnfy_user, 3500 NULL, head); 3501 pkevtp->portkev_events = AIOLIO; 3502 head->lio_portkev = pkevtp; 3503 head->lio_port = pnotify.portnfy_port; 3504 } 3505 } 3506 3507 for (i = 0; i < nent; i++, ucbp++) { 3508 3509 /* skip entry if it can't be copied. */ 3510 #ifdef _LP64 3511 cbp = (aiocb32_t *)(uintptr_t)*ucbp; 3512 if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32))) 3513 #else 3514 cbp = (aiocb_t *)*ucbp; 3515 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) 3516 #endif 3517 { 3518 if (head) { 3519 mutex_enter(&aiop->aio_mutex); 3520 head->lio_nent--; 3521 head->lio_refcnt--; 3522 mutex_exit(&aiop->aio_mutex); 3523 } 3524 continue; 3525 } 3526 #ifdef _LP64 3527 /* 3528 * copy 32 bit structure into 64 bit structure 3529 */ 3530 aiocb_32ton(aiocb32, aiocb); 3531 #endif /* _LP64 */ 3532 3533 /* skip if opcode for aiocb is LIO_NOP */ 3534 mode = aiocb->aio_lio_opcode; 3535 if (mode == LIO_NOP) { 3536 cbp = NULL; 3537 if (head) { 3538 mutex_enter(&aiop->aio_mutex); 3539 head->lio_nent--; 3540 head->lio_refcnt--; 3541 mutex_exit(&aiop->aio_mutex); 3542 } 3543 continue; 3544 } 3545 3546 /* increment file descriptor's ref count. */ 3547 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 3548 lio_set_uerror(&cbp->aio_resultp, EBADF); 3549 if (head) { 3550 mutex_enter(&aiop->aio_mutex); 3551 head->lio_nent--; 3552 head->lio_refcnt--; 3553 mutex_exit(&aiop->aio_mutex); 3554 } 3555 aio_errors++; 3556 continue; 3557 } 3558 3559 /* 3560 * check the permission of the partition 3561 */ 3562 if ((fp->f_flag & mode) == 0) { 3563 releasef(aiocb->aio_fildes); 3564 lio_set_uerror(&cbp->aio_resultp, EBADF); 3565 if (head) { 3566 mutex_enter(&aiop->aio_mutex); 3567 head->lio_nent--; 3568 head->lio_refcnt--; 3569 mutex_exit(&aiop->aio_mutex); 3570 } 3571 aio_errors++; 3572 continue; 3573 } 3574 3575 /* 3576 * common case where requests are to the same fd 3577 * for the same r/w operation 3578 * for UFS, need to set EBADFD 3579 */ 3580 vp = fp->f_vnode; 3581 if (fp != prev_fp || mode != prev_mode) { 3582 aio_func = check_vp(vp, mode); 3583 if (aio_func == NULL) { 3584 prev_fp = NULL; 3585 releasef(aiocb->aio_fildes); 3586 lio_set_uerror(&cbp->aio_resultp, EBADFD); 3587 aio_notsupported++; 3588 if (head) { 3589 mutex_enter(&aiop->aio_mutex); 3590 head->lio_nent--; 3591 head->lio_refcnt--; 3592 mutex_exit(&aiop->aio_mutex); 3593 } 3594 continue; 3595 } else { 3596 prev_fp = fp; 3597 prev_mode = mode; 3598 } 3599 } 3600 3601 error = aio_req_setup(&reqp, aiop, aiocb, 3602 (aio_result_t *)&cbp->aio_resultp, vp); 3603 if (error) { 3604 releasef(aiocb->aio_fildes); 3605 lio_set_uerror(&cbp->aio_resultp, error); 3606 if (head) { 3607 mutex_enter(&aiop->aio_mutex); 3608 head->lio_nent--; 3609 head->lio_refcnt--; 3610 mutex_exit(&aiop->aio_mutex); 3611 } 3612 aio_errors++; 3613 continue; 3614 } 3615 3616 reqp->aio_req_lio = head; 3617 deadhead = 0; 3618 3619 /* 3620 * Set the errno field now before sending the request to 3621 * the driver to avoid a race condition 3622 */ 3623 (void) suword32(&cbp->aio_resultp.aio_errno, 3624 EINPROGRESS); 3625 3626 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp; 3627 3628 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE; 3629 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT); 3630 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD); 3631 if (aio_port | aio_thread) { 3632 port_kevent_t *lpkevp; 3633 /* 3634 * Prepare data to send with each aiocb completed. 
3635 */ 3636 #ifdef _LP64 3637 if (aio_port) { 3638 void *paddr = (void *)(uintptr_t) 3639 aiocb32->aio_sigevent.sigev_value.sival_ptr; 3640 if (copyin(paddr, &pnotify, sizeof (pnotify))) 3641 error = EFAULT; 3642 } else { /* aio_thread */ 3643 pnotify.portnfy_port = 3644 aiocb32->aio_sigevent.sigev_signo; 3645 pnotify.portnfy_user = 3646 aiocb32->aio_sigevent.sigev_value.sival_ptr; 3647 } 3648 #else 3649 if (aio_port) { 3650 void *paddr = 3651 aiocb->aio_sigevent.sigev_value.sival_ptr; 3652 if (copyin(paddr, &pnotify, sizeof (pnotify))) 3653 error = EFAULT; 3654 } else { /* aio_thread */ 3655 pnotify.portnfy_port = 3656 aiocb->aio_sigevent.sigev_signo; 3657 pnotify.portnfy_user = 3658 aiocb->aio_sigevent.sigev_value.sival_ptr; 3659 } 3660 #endif 3661 if (error) 3662 /* EMPTY */; 3663 else if (pkevtp != NULL && 3664 pnotify.portnfy_port == lio_head_port) 3665 error = port_dup_event(pkevtp, &lpkevp, 3666 PORT_ALLOC_DEFAULT); 3667 else 3668 error = port_alloc_event(pnotify.portnfy_port, 3669 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, 3670 &lpkevp); 3671 if (error == 0) { 3672 port_init_event(lpkevp, (uintptr_t)cbp, 3673 (void *)(uintptr_t)pnotify.portnfy_user, 3674 aio_port_callback, reqp); 3675 lpkevp->portkev_events = event; 3676 reqp->aio_req_portkev = lpkevp; 3677 reqp->aio_req_port = pnotify.portnfy_port; 3678 } 3679 } 3680 3681 /* 3682 * send the request to driver. 3683 */ 3684 if (error == 0) { 3685 if (aiocb->aio_nbytes == 0) { 3686 clear_active_fd(aiocb->aio_fildes); 3687 aio_zerolen(reqp); 3688 continue; 3689 } 3690 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 3691 CRED()); 3692 } 3693 3694 /* 3695 * the fd's ref count is not decremented until the IO has 3696 * completed unless there was an error. 3697 */ 3698 if (error) { 3699 releasef(aiocb->aio_fildes); 3700 lio_set_uerror(&cbp->aio_resultp, error); 3701 if (head) { 3702 mutex_enter(&aiop->aio_mutex); 3703 head->lio_nent--; 3704 head->lio_refcnt--; 3705 mutex_exit(&aiop->aio_mutex); 3706 } 3707 if (error == ENOTSUP) 3708 aio_notsupported++; 3709 else 3710 aio_errors++; 3711 lio_set_error(reqp); 3712 } else { 3713 clear_active_fd(aiocb->aio_fildes); 3714 } 3715 } 3716 3717 if (aio_notsupported) { 3718 error = ENOTSUP; 3719 } else if (aio_errors) { 3720 /* 3721 * return EIO if any request failed 3722 */ 3723 error = EIO; 3724 } 3725 3726 if (mode_arg == LIO_WAIT) { 3727 mutex_enter(&aiop->aio_mutex); 3728 while (head->lio_refcnt > 0) { 3729 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 3730 mutex_exit(&aiop->aio_mutex); 3731 error = EINTR; 3732 goto done; 3733 } 3734 } 3735 mutex_exit(&aiop->aio_mutex); 3736 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32); 3737 } 3738 3739 done: 3740 kmem_free(cbplist, ssize); 3741 if (deadhead) { 3742 if (head->lio_sigqp) 3743 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 3744 if (head->lio_portkev) 3745 port_free_event(head->lio_portkev); 3746 kmem_free(head, sizeof (aio_lio_t)); 3747 } 3748 return (error); 3749 } 3750 3751 3752 #ifdef _SYSCALL32_IMPL 3753 void 3754 aiocb_32ton(aiocb32_t *src, aiocb_t *dest) 3755 { 3756 dest->aio_fildes = src->aio_fildes; 3757 dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf; 3758 dest->aio_nbytes = (size_t)src->aio_nbytes; 3759 dest->aio_offset = (off_t)src->aio_offset; 3760 dest->aio_reqprio = src->aio_reqprio; 3761 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify; 3762 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo; 3763 3764 /* 3765 * See comment in sigqueue32() on handling of 32-bit 3766 * sigvals in a 
64-bit kernel. 3767 */ 3768 dest->aio_sigevent.sigev_value.sival_int = 3769 (int)src->aio_sigevent.sigev_value.sival_int; 3770 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval)) 3771 (uintptr_t)src->aio_sigevent.sigev_notify_function; 3772 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *) 3773 (uintptr_t)src->aio_sigevent.sigev_notify_attributes; 3774 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2; 3775 dest->aio_lio_opcode = src->aio_lio_opcode; 3776 dest->aio_state = src->aio_state; 3777 dest->aio__pad[0] = src->aio__pad[0]; 3778 } 3779 #endif /* _SYSCALL32_IMPL */ 3780 3781 /* 3782 * aio_port_callback() is called just before the event is retrieved from the 3783 * port. The task of this callback function is to finish the work of the 3784 * transaction for the application, which means: 3785 * - copyout transaction data to the application 3786 * (this thread is running in the right process context) 3787 * - keep track of the transaction (update counters). 3788 * - free allocated buffers 3789 * The aiocb pointer is the object element of the port_kevent_t structure. 3790 * 3791 * flag : 3792 * PORT_CALLBACK_DEFAULT : do copyout and free resources 3793 * PORT_CALLBACK_CLOSE : don't do copyout, free resources 3794 */ 3795 3796 /*ARGSUSED*/ 3797 int 3798 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp) 3799 { 3800 aio_t *aiop = curproc->p_aio; 3801 aio_req_t *reqp = arg; 3802 struct iovec *iov; 3803 struct buf *bp; 3804 void *resultp; 3805 3806 if (pid != curproc->p_pid) { 3807 /* wrong process; cannot deliver data here */ 3808 return (EACCES); 3809 } 3810 3811 mutex_enter(&aiop->aio_portq_mutex); 3812 reqp->aio_req_portkev = NULL; 3813 aio_req_remove_portq(aiop, reqp); /* remove request from portq */ 3814 mutex_exit(&aiop->aio_portq_mutex); 3815 aphysio_unlock(reqp); /* unlock used pages */ 3816 mutex_enter(&aiop->aio_mutex); 3817 if (reqp->aio_req_flags & AIO_COPYOUTDONE) { 3818 aio_req_free_port(aiop, reqp); /* back to free list */ 3819 mutex_exit(&aiop->aio_mutex); 3820 return (0); 3821 } 3822 3823 iov = reqp->aio_req_uio.uio_iov; 3824 bp = &reqp->aio_req_buf; 3825 resultp = (void *)reqp->aio_req_resultp; 3826 aio_req_free_port(aiop, reqp); /* request struct back to free list */ 3827 mutex_exit(&aiop->aio_mutex); 3828 if (flag == PORT_CALLBACK_DEFAULT) 3829 aio_copyout_result_port(iov, bp, resultp); 3830 return (0); 3831 } 3832
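/*
 * Illustrative userland counterpart -- a hedged sketch, not part of this
 * file: aio_port_callback() above runs in the context of the process
 * calling port_get(), just before an event like the following is
 * delivered.  portev_object carries the aiocb address and portev_user the
 * portnfy_user cookie supplied at submission time.
 *
 *	port_event_t pe;
 *
 *	if (port_get(port, &pe, NULL) == 0 &&
 *	    pe.portev_source == PORT_SOURCE_AIO) {
 *		aiocb_t *cb = (aiocb_t *)pe.portev_object;
 *		void *cookie = pe.portev_user;
 *		(void) aio_return(cb);		-- reap the completed request
 *	}
 */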