1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Kernel asynchronous I/O. 29 * This is only for raw devices now (as of Nov. 1993). 30 */ 31 32 #include <sys/types.h> 33 #include <sys/errno.h> 34 #include <sys/conf.h> 35 #include <sys/file.h> 36 #include <sys/fs/snode.h> 37 #include <sys/unistd.h> 38 #include <sys/cmn_err.h> 39 #include <vm/as.h> 40 #include <vm/faultcode.h> 41 #include <sys/sysmacros.h> 42 #include <sys/procfs.h> 43 #include <sys/kmem.h> 44 #include <sys/autoconf.h> 45 #include <sys/ddi_impldefs.h> 46 #include <sys/sunddi.h> 47 #include <sys/aio_impl.h> 48 #include <sys/debug.h> 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/vmsystm.h> 52 #include <sys/fs/pxfs_ki.h> 53 #include <sys/contract/process_impl.h> 54 55 /* 56 * external entry point. 
57 */ 58 #ifdef _LP64 59 static int64_t kaioc(long, long, long, long, long, long); 60 #endif 61 static int kaio(ulong_t *, rval_t *); 62 63 64 #define AIO_64 0 65 #define AIO_32 1 66 #define AIO_LARGEFILE 2 67 68 /* 69 * implementation specific functions (private) 70 */ 71 #ifdef _LP64 72 static int alio(int, aiocb_t **, int, struct sigevent *); 73 #endif 74 static int aionotify(void); 75 static int aioinit(void); 76 static int aiostart(void); 77 static void alio_cleanup(aio_t *, aiocb_t **, int, int); 78 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *, 79 cred_t *); 80 static void lio_set_error(aio_req_t *, int portused); 81 static aio_t *aio_aiop_alloc(); 82 static int aio_req_alloc(aio_req_t **, aio_result_t *); 83 static int aio_lio_alloc(aio_lio_t **); 84 static aio_req_t *aio_req_done(void *); 85 static aio_req_t *aio_req_remove(aio_req_t *); 86 static int aio_req_find(aio_result_t *, aio_req_t **); 87 static int aio_hash_insert(struct aio_req_t *, aio_t *); 88 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *, 89 aio_result_t *, vnode_t *, int); 90 static int aio_cleanup_thread(aio_t *); 91 static aio_lio_t *aio_list_get(aio_result_t *); 92 static void lio_set_uerror(void *, int); 93 extern void aio_zerolen(aio_req_t *); 94 static int aiowait(struct timeval *, int, long *); 95 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *); 96 static int aio_unlock_requests(caddr_t iocblist, int iocb_index, 97 aio_req_t *reqlist, aio_t *aiop, model_t model); 98 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max); 99 static int aiosuspend(void *, int, struct timespec *, int, 100 long *, int); 101 static int aliowait(int, void *, int, void *, int); 102 static int aioerror(void *, int); 103 static int aio_cancel(int, void *, long *, int); 104 static int arw(int, int, char *, int, offset_t, aio_result_t *, int); 105 static int aiorw(int, void *, int, int); 106 107 static int alioLF(int, void *, int, void *); 108 
static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *, 109 aio_result_t *, vnode_t *, int); 110 static int alio32(int, void *, int, void *); 111 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p); 112 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p); 113 114 #ifdef _SYSCALL32_IMPL 115 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *); 116 void aiocb_32ton(aiocb32_t *, aiocb_t *); 117 #endif /* _SYSCALL32_IMPL */ 118 119 /* 120 * implementation specific functions (external) 121 */ 122 void aio_req_free(aio_t *, aio_req_t *); 123 124 /* 125 * Event Port framework 126 */ 127 128 void aio_req_free_port(aio_t *, aio_req_t *); 129 static int aio_port_callback(void *, int *, pid_t, int, void *); 130 131 /* 132 * This is the loadable module wrapper. 133 */ 134 #include <sys/modctl.h> 135 #include <sys/syscall.h> 136 137 #ifdef _LP64 138 139 static struct sysent kaio_sysent = { 140 6, 141 SE_NOUNLOAD | SE_64RVAL | SE_ARGC, 142 (int (*)())kaioc 143 }; 144 145 #ifdef _SYSCALL32_IMPL 146 static struct sysent kaio_sysent32 = { 147 7, 148 SE_NOUNLOAD | SE_64RVAL, 149 kaio 150 }; 151 #endif /* _SYSCALL32_IMPL */ 152 153 #else /* _LP64 */ 154 155 static struct sysent kaio_sysent = { 156 7, 157 SE_NOUNLOAD | SE_32RVAL1, 158 kaio 159 }; 160 161 #endif /* _LP64 */ 162 163 /* 164 * Module linkage information for the kernel. 
 */

static struct modlsys modlsys = {
	&mod_syscallops,
	"kernel Async I/O",
	&kaio_sysent
};

#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"kernel Async I/O for 32 bit compatibility",
	&kaio_sysent32
};
#endif  /* _SYSCALL32_IMPL */


static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};

/*
 * Install the kaio system call entry point(s) described by modlinkage.
 */
int
_init(void)
{
	int retval;

	if ((retval = mod_install(&modlinkage)) != 0)
		return (retval);

	return (0);
}

/*
 * Attempt to unload; mod_remove() will fail while the syscall is marked
 * SE_NOUNLOAD in the sysent tables above.
 */
int
_fini(void)
{
	int retval;

	retval = mod_remove(&modlinkage);

	return (retval);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

#ifdef _LP64
/*
 * kaioc: native LP64 system call entry point.
 *
 * a0 carries the AIO opcode (AIO_POLL_BIT may be or'ed in and is masked
 * off for dispatch, but the full a0 is still passed to arw()/aiorw()).
 * a1..a5 are opcode-specific.  A nonzero error from the implementation
 * routine is reported through set_errno(); otherwise the per-call rval
 * is returned.
 */
static int64_t
kaioc(
	long	a0,
	long	a1,
	long	a2,
	long	a3,
	long	a4,
	long	a5)
{
	int	error;
	long	rval = 0;

	switch ((int)a0 & ~AIO_POLL_BIT) {
	case AIOREAD:
		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
		    (offset_t)a4, (aio_result_t *)a5, FREAD);
		break;
	case AIOWRITE:
		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
		break;
	case AIOWAIT:
		error = aiowait((struct timeval *)a1, (int)a2, &rval);
		break;
	case AIOWAITN:
		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
		    (timespec_t *)a4);
		break;
	case AIONOTIFY:
		error = aionotify();
		break;
	case AIOINIT:
		error = aioinit();
		break;
	case AIOSTART:
		error = aiostart();
		break;
	case AIOLIO:
		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
		    (struct sigevent *)a4);
		break;
	case AIOLIOWAIT:
		error = aliowait((int)a1, (void *)a2, (int)a3,
		    (struct sigevent *)a4, AIO_64);
		break;
	case AIOSUSPEND:
		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
		    (int)a4, &rval, AIO_64);
		break;
	case AIOERROR:
		error = aioerror((void *)a1, AIO_64);
		break;
	case AIOAREAD:
		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
		break;
	case AIOAWRITE:
		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
		break;
	case AIOCANCEL:
		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
		break;

	/*
	 * The large file related stuff is valid only for
	 * 32 bit kernel and not for 64 bit kernel
	 * On 64 bit kernel we convert large file calls
	 * to regular 64bit calls.
	 */

	default:
		error = EINVAL;
	}
	if (error)
		return ((int64_t)set_errno(error));
	return (rval);
}
#endif

/*
 * kaio: 32-bit (and 32-bit-compat) system call entry point.
 *
 * uap[] holds 7 words; the 64-bit file offset for AIOREAD/AIOWRITE is
 * split across uap[4]/uap[5] in an endianness-dependent order and is
 * reassembled into 'off' below.  Errors are returned directly (the
 * syscall layer maps them to errno); rvp->r_val1 carries the rval for
 * the cases that fall through the switch.
 */
static int
kaio(
	ulong_t *uap,
	rval_t *rvp)
{
	long	rval = 0;
	int	error = 0;
	offset_t	off;


	rvp->r_vals = 0;
	/* reassemble the 64-bit offset from its two 32-bit halves */
#if defined(_LITTLE_ENDIAN)
	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
#else
	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
#endif

	switch (uap[0] & ~AIO_POLL_BIT) {
	/*
	 * It must be the 32 bit system call on 64 bit kernel
	 */
	case AIOREAD:
		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
	case AIOWRITE:
		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
	case AIOWAIT:
		error = aiowait((struct timeval *)uap[1], (int)uap[2],
		    &rval);
		break;
	case AIOWAITN:
		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
		    (uint_t *)uap[3], (timespec_t *)uap[4]);
		break;
	case AIONOTIFY:
		return (aionotify());
	case AIOINIT:
		return (aioinit());
	case AIOSTART:
		return (aiostart());
	case AIOLIO:
		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
		    (void *)uap[4]));
	case AIOLIOWAIT:
		return (aliowait((int)uap[1], (void *)uap[2],
		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
	case AIOSUSPEND:
		error = aiosuspend((void *)uap[1], (int)uap[2],
		    (timespec_t *)uap[3], (int)uap[4],
		    &rval, AIO_32);
		break;
	case AIOERROR:
		return (aioerror((void *)uap[1], AIO_32));
	case AIOAREAD:
		return (aiorw((int)uap[0], (void *)uap[1],
		    FREAD, AIO_32));
	case AIOAWRITE:
		return (aiorw((int)uap[0], (void *)uap[1],
		    FWRITE, AIO_32));
	case AIOCANCEL:
		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
		    AIO_32));
		break;
	case AIOLIO64:
		return (alioLF((int)uap[1], (void *)uap[2],
		    (int)uap[3], (void *)uap[4]));
	case AIOLIOWAIT64:
		/*
		 * NOTE(review): uap[1] is passed without the (int) cast
		 * used by every sibling case; the implicit narrowing to
		 * aliowait()'s int parameter is equivalent, but the cast
		 * should be added for consistency.
		 */
		return (aliowait(uap[1], (void *)uap[2],
		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
	case AIOSUSPEND64:
		error = aiosuspend((void *)uap[1], (int)uap[2],
		    (timespec_t *)uap[3], (int)uap[4], &rval,
		    AIO_LARGEFILE);
		break;
	case AIOERROR64:
		return (aioerror((void *)uap[1], AIO_LARGEFILE));
	case AIOAREAD64:
		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
		    AIO_LARGEFILE));
	case AIOAWRITE64:
		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
		    AIO_LARGEFILE));
	case AIOCANCEL64:
		error = (aio_cancel((int)uap[1], (void *)uap[2],
		    &rval, AIO_LARGEFILE));
		break;
	default:
		return (EINVAL);
	}

	rvp->r_val1 = rval;
	return (error);
}

/*
 * wake up LWPs in this process that are sleeping in
 * aiowait().
395 */ 396 static int 397 aionotify(void) 398 { 399 aio_t *aiop; 400 401 aiop = curproc->p_aio; 402 if (aiop == NULL) 403 return (0); 404 405 mutex_enter(&aiop->aio_mutex); 406 aiop->aio_notifycnt++; 407 cv_broadcast(&aiop->aio_waitcv); 408 mutex_exit(&aiop->aio_mutex); 409 410 return (0); 411 } 412 413 static int 414 timeval2reltime(struct timeval *timout, timestruc_t *rqtime, 415 timestruc_t **rqtp, int *blocking) 416 { 417 #ifdef _SYSCALL32_IMPL 418 struct timeval32 wait_time_32; 419 #endif 420 struct timeval wait_time; 421 model_t model = get_udatamodel(); 422 423 *rqtp = NULL; 424 if (timout == NULL) { /* wait indefinitely */ 425 *blocking = 1; 426 return (0); 427 } 428 429 /* 430 * Need to correctly compare with the -1 passed in for a user 431 * address pointer, with both 32 bit and 64 bit apps. 432 */ 433 if (model == DATAMODEL_NATIVE) { 434 if ((intptr_t)timout == (intptr_t)-1) { /* don't wait */ 435 *blocking = 0; 436 return (0); 437 } 438 439 if (copyin(timout, &wait_time, sizeof (wait_time))) 440 return (EFAULT); 441 } 442 #ifdef _SYSCALL32_IMPL 443 else { 444 /* 445 * -1 from a 32bit app. It will not get sign extended. 446 * don't wait if -1. 
447 */ 448 if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) { 449 *blocking = 0; 450 return (0); 451 } 452 453 if (copyin(timout, &wait_time_32, sizeof (wait_time_32))) 454 return (EFAULT); 455 TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32); 456 } 457 #endif /* _SYSCALL32_IMPL */ 458 459 if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) { /* don't wait */ 460 *blocking = 0; 461 return (0); 462 } 463 464 if (wait_time.tv_sec < 0 || 465 wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC) 466 return (EINVAL); 467 468 rqtime->tv_sec = wait_time.tv_sec; 469 rqtime->tv_nsec = wait_time.tv_usec * 1000; 470 *rqtp = rqtime; 471 *blocking = 1; 472 473 return (0); 474 } 475 476 static int 477 timespec2reltime(timespec_t *timout, timestruc_t *rqtime, 478 timestruc_t **rqtp, int *blocking) 479 { 480 #ifdef _SYSCALL32_IMPL 481 timespec32_t wait_time_32; 482 #endif 483 model_t model = get_udatamodel(); 484 485 *rqtp = NULL; 486 if (timout == NULL) { 487 *blocking = 1; 488 return (0); 489 } 490 491 if (model == DATAMODEL_NATIVE) { 492 if (copyin(timout, rqtime, sizeof (*rqtime))) 493 return (EFAULT); 494 } 495 #ifdef _SYSCALL32_IMPL 496 else { 497 if (copyin(timout, &wait_time_32, sizeof (wait_time_32))) 498 return (EFAULT); 499 TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32); 500 } 501 #endif /* _SYSCALL32_IMPL */ 502 503 if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) { 504 *blocking = 0; 505 return (0); 506 } 507 508 if (rqtime->tv_sec < 0 || 509 rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC) 510 return (EINVAL); 511 512 *rqtp = rqtime; 513 *blocking = 1; 514 515 return (0); 516 } 517 518 /*ARGSUSED*/ 519 static int 520 aiowait( 521 struct timeval *timout, 522 int dontblockflg, 523 long *rval) 524 { 525 int error; 526 aio_t *aiop; 527 aio_req_t *reqp; 528 clock_t status; 529 int blocking; 530 int timecheck; 531 timestruc_t rqtime; 532 timestruc_t *rqtp; 533 534 aiop = curproc->p_aio; 535 if (aiop == NULL) 536 return (EINVAL); 537 538 /* 539 * Establish the absolute 
future time for the timeout. 540 */ 541 error = timeval2reltime(timout, &rqtime, &rqtp, &blocking); 542 if (error) 543 return (error); 544 if (rqtp) { 545 timestruc_t now; 546 timecheck = timechanged; 547 gethrestime(&now); 548 timespecadd(rqtp, &now); 549 } 550 551 mutex_enter(&aiop->aio_mutex); 552 for (;;) { 553 /* process requests on poll queue */ 554 if (aiop->aio_pollq) { 555 mutex_exit(&aiop->aio_mutex); 556 aio_cleanup(0); 557 mutex_enter(&aiop->aio_mutex); 558 } 559 if ((reqp = aio_req_remove(NULL)) != NULL) { 560 *rval = (long)reqp->aio_req_resultp; 561 break; 562 } 563 /* user-level done queue might not be empty */ 564 if (aiop->aio_notifycnt > 0) { 565 aiop->aio_notifycnt--; 566 *rval = 1; 567 break; 568 } 569 /* don't block if no outstanding aio */ 570 if (aiop->aio_outstanding == 0 && dontblockflg) { 571 error = EINVAL; 572 break; 573 } 574 if (blocking) { 575 status = cv_waituntil_sig(&aiop->aio_waitcv, 576 &aiop->aio_mutex, rqtp, timecheck); 577 578 if (status > 0) /* check done queue again */ 579 continue; 580 if (status == 0) { /* interrupted by a signal */ 581 error = EINTR; 582 *rval = -1; 583 } else { /* timer expired */ 584 error = ETIME; 585 } 586 } 587 break; 588 } 589 mutex_exit(&aiop->aio_mutex); 590 if (reqp) { 591 aphysio_unlock(reqp); 592 aio_copyout_result(reqp); 593 mutex_enter(&aiop->aio_mutex); 594 aio_req_free(aiop, reqp); 595 mutex_exit(&aiop->aio_mutex); 596 } 597 return (error); 598 } 599 600 /* 601 * aiowaitn can be used to reap completed asynchronous requests submitted with 602 * lio_listio, aio_read or aio_write. 603 * This function only reaps asynchronous raw I/Os. 
604 */ 605 606 /*ARGSUSED*/ 607 static int 608 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout) 609 { 610 int error = 0; 611 aio_t *aiop; 612 aio_req_t *reqlist = NULL; 613 caddr_t iocblist = NULL; /* array of iocb ptr's */ 614 uint_t waitcnt, cnt = 0; /* iocb cnt */ 615 size_t iocbsz; /* users iocb size */ 616 size_t riocbsz; /* returned iocb size */ 617 int iocb_index = 0; 618 model_t model = get_udatamodel(); 619 int blocking = 1; 620 int timecheck; 621 timestruc_t rqtime; 622 timestruc_t *rqtp; 623 624 aiop = curproc->p_aio; 625 if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX) 626 return (EINVAL); 627 628 if (aiop->aio_outstanding == 0) 629 return (EAGAIN); 630 631 if (copyin(nwait, &waitcnt, sizeof (uint_t))) 632 return (EFAULT); 633 634 /* set *nwait to zero, if we must return prematurely */ 635 if (copyout(&cnt, nwait, sizeof (uint_t))) 636 return (EFAULT); 637 638 if (waitcnt == 0) { 639 blocking = 0; 640 rqtp = NULL; 641 waitcnt = nent; 642 } else { 643 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking); 644 if (error) 645 return (error); 646 } 647 648 if (model == DATAMODEL_NATIVE) 649 iocbsz = (sizeof (aiocb_t *) * nent); 650 #ifdef _SYSCALL32_IMPL 651 else 652 iocbsz = (sizeof (caddr32_t) * nent); 653 #endif /* _SYSCALL32_IMPL */ 654 655 /* 656 * Only one aio_waitn call is allowed at a time. 657 * The active aio_waitn will collect all requests 658 * out of the "done" list and if necessary it will wait 659 * for some/all pending requests to fulfill the nwait 660 * parameter. 661 * A second or further aio_waitn calls will sleep here 662 * until the active aio_waitn finishes and leaves the kernel 663 * If the second call does not block (poll), then return 664 * immediately with the error code : EAGAIN. 665 * If the second call should block, then sleep here, but 666 * do not touch the timeout. The timeout starts when this 667 * aio_waitn-call becomes active. 
668 */ 669 670 mutex_enter(&aiop->aio_mutex); 671 672 while (aiop->aio_flags & AIO_WAITN) { 673 if (blocking == 0) { 674 mutex_exit(&aiop->aio_mutex); 675 return (EAGAIN); 676 } 677 678 /* block, no timeout */ 679 aiop->aio_flags |= AIO_WAITN_PENDING; 680 if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) { 681 mutex_exit(&aiop->aio_mutex); 682 return (EINTR); 683 } 684 } 685 686 /* 687 * Establish the absolute future time for the timeout. 688 */ 689 if (rqtp) { 690 timestruc_t now; 691 timecheck = timechanged; 692 gethrestime(&now); 693 timespecadd(rqtp, &now); 694 } 695 696 if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) { 697 kmem_free(aiop->aio_iocb, aiop->aio_iocbsz); 698 aiop->aio_iocb = NULL; 699 } 700 701 if (aiop->aio_iocb == NULL) { 702 iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP); 703 if (iocblist == NULL) { 704 mutex_exit(&aiop->aio_mutex); 705 return (ENOMEM); 706 } 707 aiop->aio_iocb = (aiocb_t **)iocblist; 708 aiop->aio_iocbsz = iocbsz; 709 } else { 710 iocblist = (char *)aiop->aio_iocb; 711 } 712 713 aiop->aio_waitncnt = waitcnt; 714 aiop->aio_flags |= AIO_WAITN; 715 716 for (;;) { 717 /* push requests on poll queue to done queue */ 718 if (aiop->aio_pollq) { 719 mutex_exit(&aiop->aio_mutex); 720 aio_cleanup(0); 721 mutex_enter(&aiop->aio_mutex); 722 } 723 724 /* check for requests on done queue */ 725 if (aiop->aio_doneq) { 726 cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt); 727 aiop->aio_waitncnt = waitcnt - cnt; 728 } 729 730 /* user-level done queue might not be empty */ 731 if (aiop->aio_notifycnt > 0) { 732 aiop->aio_notifycnt--; 733 error = 0; 734 break; 735 } 736 737 /* 738 * if we are here second time as a result of timer 739 * expiration, we reset error if there are enough 740 * aiocb's to satisfy request. 741 * We return also if all requests are already done 742 * and we picked up the whole done queue. 
743 */ 744 745 if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 && 746 aiop->aio_doneq == NULL)) { 747 error = 0; 748 break; 749 } 750 751 if ((cnt < waitcnt) && blocking) { 752 int rval = cv_waituntil_sig(&aiop->aio_waitcv, 753 &aiop->aio_mutex, rqtp, timecheck); 754 if (rval > 0) 755 continue; 756 if (rval < 0) { 757 error = ETIME; 758 blocking = 0; 759 continue; 760 } 761 error = EINTR; 762 } 763 break; 764 } 765 766 mutex_exit(&aiop->aio_mutex); 767 768 if (cnt > 0) { 769 770 iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist, 771 aiop, model); 772 773 if (model == DATAMODEL_NATIVE) 774 riocbsz = (sizeof (aiocb_t *) * cnt); 775 #ifdef _SYSCALL32_IMPL 776 else 777 riocbsz = (sizeof (caddr32_t) * cnt); 778 #endif /* _SYSCALL32_IMPL */ 779 780 if (copyout(iocblist, uiocb, riocbsz) || 781 copyout(&cnt, nwait, sizeof (uint_t))) 782 error = EFAULT; 783 } 784 785 /* check if there is another thread waiting for execution */ 786 mutex_enter(&aiop->aio_mutex); 787 aiop->aio_flags &= ~AIO_WAITN; 788 if (aiop->aio_flags & AIO_WAITN_PENDING) { 789 aiop->aio_flags &= ~AIO_WAITN_PENDING; 790 cv_signal(&aiop->aio_waitncv); 791 } 792 mutex_exit(&aiop->aio_mutex); 793 794 return (error); 795 } 796 797 /* 798 * aio_unlock_requests 799 * copyouts the result of the request as well as the return value. 800 * It builds the list of completed asynchronous requests, 801 * unlocks the allocated memory ranges and 802 * put the aio request structure back into the free list. 
803 */ 804 805 static int 806 aio_unlock_requests( 807 caddr_t iocblist, 808 int iocb_index, 809 aio_req_t *reqlist, 810 aio_t *aiop, 811 model_t model) 812 { 813 aio_req_t *reqp, *nreqp; 814 815 if (model == DATAMODEL_NATIVE) { 816 for (reqp = reqlist; reqp != NULL; reqp = nreqp) { 817 (((caddr_t *)iocblist)[iocb_index++]) = 818 reqp->aio_req_iocb.iocb; 819 nreqp = reqp->aio_req_next; 820 aphysio_unlock(reqp); 821 aio_copyout_result(reqp); 822 mutex_enter(&aiop->aio_mutex); 823 aio_req_free(aiop, reqp); 824 mutex_exit(&aiop->aio_mutex); 825 } 826 } 827 #ifdef _SYSCALL32_IMPL 828 else { 829 for (reqp = reqlist; reqp != NULL; reqp = nreqp) { 830 ((caddr32_t *)iocblist)[iocb_index++] = 831 reqp->aio_req_iocb.iocb32; 832 nreqp = reqp->aio_req_next; 833 aphysio_unlock(reqp); 834 aio_copyout_result(reqp); 835 mutex_enter(&aiop->aio_mutex); 836 aio_req_free(aiop, reqp); 837 mutex_exit(&aiop->aio_mutex); 838 } 839 } 840 #endif /* _SYSCALL32_IMPL */ 841 return (iocb_index); 842 } 843 844 /* 845 * aio_reqlist_concat 846 * moves "max" elements from the done queue to the reqlist queue and removes 847 * the AIO_DONEQ flag. 
 * - reqlist queue is a simple linked list
 * - done queue is a double linked list
 */

/*
 * NOTE(review): the loop below dereferences aiop->aio_doneq without a
 * NULL check, so callers must only invoke this with a non-empty done
 * queue; the one visible caller (aiowaitn) tests aiop->aio_doneq first.
 * Presumably aiop->aio_mutex is held across the call (the caller holds
 * it) -- confirm before reuse.
 */
static int
aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
{
	aio_req_t *q2, *q2work, *list;
	int count = 0;

	list = *reqlist;
	q2 = aiop->aio_doneq;
	q2work = q2;
	/* clear AIO_DONEQ on up to max entries, wrapping detection via q2 */
	while (max-- > 0) {
		q2work->aio_req_flags &= ~AIO_DONEQ;
		q2work = q2work->aio_req_next;
		count++;
		if (q2work == q2)
			break;
	}

	if (q2work == q2) {
		/* all elements revised */
		q2->aio_req_prev->aio_req_next = list;
		list = q2;
		aiop->aio_doneq = NULL;
	} else {
		/*
		 * max < elements in the doneq
		 * detach only the required amount of elements
		 * out of the doneq
		 */
		q2work->aio_req_prev->aio_req_next = list;
		list = q2;

		aiop->aio_doneq = q2work;
		q2work->aio_req_prev = q2->aio_req_prev;
		q2->aio_req_prev->aio_req_next = q2work;
	}
	*reqlist = list;
	return (count);
}

/*
 * aiosuspend: wait until at least one of the requests named in the
 * user array 'aiocb' (nent entries, layout selected by run_mode) has
 * completed, honoring the timespec in 'timout'.  All completed entries
 * found are unlocked, copied out and freed before returning.
 *
 * Lock ordering: aio_cleanupq_mutex must be taken before aio_mutex
 * (required because aio_req_done() is called); the blocking path drops
 * and reacquires both in that order.
 */
/*ARGSUSED*/
static int
aiosuspend(
	void	*aiocb,
	int	nent,
	struct	timespec	*timout,
	int	flag,
	long	*rval,
	int	run_mode)
{
	int 		error;
	aio_t		*aiop;
	aio_req_t	*reqp, *found, *next;
	caddr_t		cbplist = NULL;
	aiocb_t		*cbp, **ucbp;
#ifdef	_SYSCALL32_IMPL
	aiocb32_t	*cbp32;
	caddr32_t	*ucbp32;
#endif  /* _SYSCALL32_IMPL */
	aiocb64_32_t	*cbp64;
	int		rv;
	int		i;
	size_t		ssize;
	model_t		model = get_udatamodel();
	int		blocking;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	/*
	 * Establish the absolute future time for the timeout.
	 */
	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
	if (error)
		return (error);
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	/*
	 * If we are not blocking and there's no IO complete
	 * skip aiocb copyin.
	 */
	if (!blocking && (aiop->aio_pollq == NULL) &&
	    (aiop->aio_doneq == NULL)) {
		return (EAGAIN);
	}

	if (model == DATAMODEL_NATIVE)
		ssize = (sizeof (aiocb_t *) * nent);
#ifdef	_SYSCALL32_IMPL
	else
		ssize = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
	if (cbplist == NULL)
		return (ENOMEM);

	if (copyin(aiocb, cbplist, ssize)) {
		error = EFAULT;
		goto done;
	}

	found = NULL;
	/*
	 * we need to get the aio_cleanupq_mutex since we call
	 * aio_req_done().
	 */
	mutex_enter(&aiop->aio_cleanupq_mutex);
	mutex_enter(&aiop->aio_mutex);
	for (;;) {
		/* push requests on poll queue to done queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_cleanupq_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_cleanupq_mutex);
			mutex_enter(&aiop->aio_mutex);
		}
		/* check for requests on done queue */
		if (aiop->aio_doneq) {
			if (model == DATAMODEL_NATIVE)
				ucbp = (aiocb_t **)cbplist;
#ifdef	_SYSCALL32_IMPL
			else
				ucbp32 = (caddr32_t *)cbplist;
#endif  /* _SYSCALL32_IMPL */
			for (i = 0; i < nent; i++) {
				if (model == DATAMODEL_NATIVE) {
					if ((cbp = *ucbp++) == NULL)
						continue;
					if (run_mode != AIO_LARGEFILE)
						reqp = aio_req_done(
						    &cbp->aio_resultp);
					else {
						cbp64 = (aiocb64_32_t *)cbp;
						reqp = aio_req_done(
						    &cbp64->aio_resultp);
					}
				}
#ifdef	_SYSCALL32_IMPL
				else {
					if (run_mode == AIO_32) {
						if ((cbp32 =
						    (aiocb32_t *)(uintptr_t)
						    *ucbp32++) == NULL)
							continue;
						reqp = aio_req_done(
						    &cbp32->aio_resultp);
					} else if (run_mode == AIO_LARGEFILE) {
						if ((cbp64 =
						    (aiocb64_32_t *)(uintptr_t)
						    *ucbp32++) == NULL)
							continue;
						reqp = aio_req_done(
						    &cbp64->aio_resultp);
					}
					/*
					 * NOTE(review): for any other
					 * run_mode, reqp keeps its value
					 * from the previous iteration;
					 * callers only pass AIO_32 /
					 * AIO_LARGEFILE here -- confirm.
					 */

				}
#endif  /* _SYSCALL32_IMPL */
				if (reqp) {
					reqp->aio_req_next = found;
					found = reqp;
				}
				if (aiop->aio_doneq == NULL)
					break;
			}
			if (found)
				break;
		}
		if (aiop->aio_notifycnt > 0) {
			/*
			 * nothing on the kernel's queue. the user
			 * has notified the kernel that it has items
			 * on a user-level queue.
			 */
			aiop->aio_notifycnt--;
			*rval = 1;
			error = 0;
			break;
		}
		/* don't block if nothing is outstanding */
		if (aiop->aio_outstanding == 0) {
			error = EAGAIN;
			break;
		}
		if (blocking) {
			/*
			 * drop the aio_cleanupq_mutex as we are
			 * going to block.
			 */
			mutex_exit(&aiop->aio_cleanupq_mutex);
			rv = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);
			/*
			 * we have to drop aio_mutex and
			 * grab it in the right order.
			 */
			mutex_exit(&aiop->aio_mutex);
			mutex_enter(&aiop->aio_cleanupq_mutex);
			mutex_enter(&aiop->aio_mutex);
			if (rv > 0)	/* check done queue again */
				continue;
			if (rv == 0)	/* interrupted by a signal */
				error = EINTR;
			else		/* timer expired */
				error = ETIME;
		} else {
			error = EAGAIN;
		}
		break;
	}
	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_cleanupq_mutex);
	/* reap everything collected on 'found' outside the locks */
	for (reqp = found; reqp != NULL; reqp = next) {
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}
done:
	kmem_free(cbplist, ssize);
	return (error);
}

/*
 * initialize aio by allocating an aio_t struct for this
 * process.
 */
static int
aioinit(void)
{
	proc_t *p = curproc;
	aio_t *aiop;
	mutex_enter(&p->p_lock);
	if ((aiop = p->p_aio) == NULL) {
		aiop = aio_aiop_alloc();
		p->p_aio = aiop;
	}
	mutex_exit(&p->p_lock);
	/* aio_aiop_alloc() may have failed; report ENOMEM in that case */
	if (aiop == NULL)
		return (ENOMEM);
	return (0);
}

/*
 * start a special thread that will cleanup after aio requests
 * that are preventing a segment from being unmapped. as_unmap()
 * blocks until all phsyio to this segment is completed. this
 * doesn't happen until all the pages in this segment are not
 * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
 * requests still outstanding. this special thread will make sure
 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
 *
 * this function will return an error if the process has only
 * one LWP. the assumption is that the caller is a separate LWP
 * that remains blocked in the kernel for the life of this process.
1118 */ 1119 static int 1120 aiostart(void) 1121 { 1122 proc_t *p = curproc; 1123 aio_t *aiop; 1124 int first, error = 0; 1125 1126 if (p->p_lwpcnt == 1) 1127 return (EDEADLK); 1128 mutex_enter(&p->p_lock); 1129 if ((aiop = p->p_aio) == NULL) 1130 error = EINVAL; 1131 else { 1132 first = aiop->aio_ok; 1133 if (aiop->aio_ok == 0) 1134 aiop->aio_ok = 1; 1135 } 1136 mutex_exit(&p->p_lock); 1137 if (error == 0 && first == 0) { 1138 return (aio_cleanup_thread(aiop)); 1139 /* should return only to exit */ 1140 } 1141 return (error); 1142 } 1143 1144 /* 1145 * Associate an aiocb with a port. 1146 * This function is used by aiorw() to associate a transaction with a port. 1147 * Allocate an event port structure (port_alloc_event()) and store the 1148 * delivered user pointer (portnfy_user) in the portkev_user field of the 1149 * port_kevent_t structure.. 1150 * The aio_req_portkev pointer in the aio_req_t structure was added to identify 1151 * the port association. 1152 */ 1153 1154 static int 1155 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp, 1156 aio_req_t *reqp, int event) 1157 { 1158 port_kevent_t *pkevp = NULL; 1159 int error; 1160 1161 error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT, 1162 PORT_SOURCE_AIO, &pkevp); 1163 if (error) { 1164 if ((error == ENOMEM) || (error == EAGAIN)) 1165 error = EAGAIN; 1166 else 1167 error = EINVAL; 1168 } else { 1169 port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user, 1170 aio_port_callback, reqp); 1171 pkevp->portkev_events = event; 1172 reqp->aio_req_portkev = pkevp; 1173 reqp->aio_req_port = pntfy->portnfy_port; 1174 } 1175 return (error); 1176 } 1177 1178 #ifdef _LP64 1179 1180 /* 1181 * Asynchronous list IO. A chain of aiocb's are copied in 1182 * one at a time. If the aiocb is invalid, it is skipped. 1183 * For each aiocb, the appropriate driver entry point is 1184 * called. Optimize for the common case where the list 1185 * of requests is to the same file descriptor. 
 *
 * One possible optimization is to define a new driver entry
 * point that supports a list of IO requests. Whether this
 * improves performance depends somewhat on the driver's
 * locking strategy. Processing a list could adversely impact
 * the driver's interrupt latency.
 */
static int
alio(
	int mode_arg,
	aiocb_t **aiocb_arg,
	int nent,
	struct sigevent *sigev)
{
	file_t *fp;
	file_t *prev_fp = NULL;
	int prev_mode = -1;
	struct vnode *vp;
	aio_lio_t *head;
	aio_req_t *reqp;
	aio_t *aiop;
	caddr_t cbplist;
	aiocb_t cb;
	aiocb_t *aiocb = &cb;
	aiocb_t *cbp;
	aiocb_t **ucbp;
	struct sigevent sigevk;
	sigqueue_t *sqp;
	int (*aio_func)();
	int mode;
	int error = 0;
	int aio_errors = 0;
	int i;
	size_t ssize;
	int deadhead = 0;
	int aio_notsupported = 0;
	int lio_head_port;
	int aio_port;
	int aio_thread;
	port_kevent_t *pkevtp = NULL;
	int portused = 0;
	port_notify_t pnotify;
	int event;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	/* copy in the caller's array of aiocb pointers */
	ssize = (sizeof (aiocb_t *) * nent);
	cbplist = kmem_alloc(ssize, KM_SLEEP);
	ucbp = (aiocb_t **)cbplist;

	if (copyin(aiocb_arg, cbplist, ssize) ||
	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
		kmem_free(cbplist, ssize);
		return (EFAULT);
	}

	/* Event Ports */
	if (sigev &&
	    (sigevk.sigev_notify == SIGEV_THREAD ||
	    sigevk.sigev_notify == SIGEV_PORT)) {
		if (sigevk.sigev_notify == SIGEV_THREAD) {
			/*
			 * For SIGEV_THREAD the port number travels in
			 * sigev_signo and the user cookie in sigev_value.
			 */
			pnotify.portnfy_port = sigevk.sigev_signo;
			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
		} else if (copyin(sigevk.sigev_value.sival_ptr,
		    &pnotify, sizeof (pnotify))) {
			kmem_free(cbplist, ssize);
			return (EFAULT);
		}
		error = port_alloc_event(pnotify.portnfy_port,
		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
		if (error) {
			if (error == ENOMEM || error == EAGAIN)
				error = EAGAIN;
			else
				error = EINVAL;
			kmem_free(cbplist, ssize);
			return (error);
		}
		lio_head_port = pnotify.portnfy_port;
		portused = 1;
	}

	/*
	 * a list head should be allocated if notification is
	 * enabled for this list.
	 */
	head = NULL;

	if (mode_arg == LIO_WAIT || sigev) {
		mutex_enter(&aiop->aio_mutex);
		error = aio_lio_alloc(&head);
		mutex_exit(&aiop->aio_mutex);
		if (error)
			goto done;
		/*
		 * deadhead stays set until at least one request has been
		 * attached to head; if still set at 'done', the head (and
		 * its notification resources) must be freed here.
		 */
		deadhead = 1;
		head->lio_nent = nent;
		head->lio_refcnt = nent;
		head->lio_port = -1;
		head->lio_portkev = NULL;
		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
			/* preallocate the signal to send on list completion */
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
			if (sqp == NULL) {
				error = EAGAIN;
				goto done;
			}
			sqp->sq_func = NULL;
			sqp->sq_next = NULL;
			sqp->sq_info.si_code = SI_ASYNCIO;
			sqp->sq_info.si_pid = curproc->p_pid;
			sqp->sq_info.si_ctid = PRCTID(curproc);
			sqp->sq_info.si_zoneid = getzoneid();
			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
			sqp->sq_info.si_signo = sigevk.sigev_signo;
			sqp->sq_info.si_value = sigevk.sigev_value;
			head->lio_sigqp = sqp;
		} else {
			head->lio_sigqp = NULL;
		}
		if (pkevtp) {
			/*
			 * Prepare data to send when list of aiocb's
			 * has completed.
			 */
			port_init_event(pkevtp, (uintptr_t)sigev,
			    (void *)(uintptr_t)pnotify.portnfy_user,
			    NULL, head);
			pkevtp->portkev_events = AIOLIO;
			head->lio_portkev = pkevtp;
			head->lio_port = pnotify.portnfy_port;
		}
	}

	for (i = 0; i < nent; i++, ucbp++) {

		cbp = *ucbp;
		/*
		 * skip entry if it can't be copied.  Every skipped or
		 * failed entry drops lio_nent/lio_refcnt so the list
		 * completion accounting stays consistent.
		 */
		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* skip if opcode for aiocb is LIO_NOP */
		mode = aiocb->aio_lio_opcode;
		if (mode == LIO_NOP) {
			cbp = NULL;
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* increment file descriptor's ref count. */
		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * check the permission of the partition
		 * (LIO_READ/LIO_WRITE double as FREAD/FWRITE here)
		 */
		if ((fp->f_flag & mode) == 0) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * common case where requests are to the same fd
		 * for the same r/w operation.
		 * for UFS, need to set EBADFD
		 */
		vp = fp->f_vnode;
		if (fp != prev_fp || mode != prev_mode) {
			aio_func = check_vp(vp, mode);
			if (aio_func == NULL) {
				prev_fp = NULL;
				releasef(aiocb->aio_fildes);
				lio_set_uerror(&cbp->aio_resultp, EBADFD);
				aio_notsupported++;
				if (head) {
					mutex_enter(&aiop->aio_mutex);
					head->lio_nent--;
					head->lio_refcnt--;
					mutex_exit(&aiop->aio_mutex);
				}
				continue;
			} else {
				prev_fp = fp;
				prev_mode = mode;
			}
		}

		error = aio_req_setup(&reqp, aiop, aiocb,
		    &cbp->aio_resultp, vp, 0);
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		reqp->aio_req_lio = head;
		deadhead = 0;

		/*
		 * Set the errno field now before sending the request to
		 * the driver to avoid a race condition
		 */
		(void) suword32(&cbp->aio_resultp.aio_errno,
		    EINPROGRESS);

		reqp->aio_req_iocb.iocb = (caddr_t)cbp;

		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
		if (aio_port | aio_thread) {
			port_kevent_t *lpkevp;
			/*
			 * Prepare data to send with each aiocb completed.
			 */
			if (aio_port) {
				void *paddr =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
				if (copyin(paddr, &pnotify, sizeof (pnotify)))
					error = EFAULT;
			} else {	/* aio_thread */
				pnotify.portnfy_port =
				    aiocb->aio_sigevent.sigev_signo;
				pnotify.portnfy_user =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
			}
			if (error)
				/* EMPTY */;
			else if (pkevtp != NULL &&
			    pnotify.portnfy_port == lio_head_port)
				/*
				 * same port as the list head: share its
				 * event allocation instead of a fresh one.
				 */
				error = port_dup_event(pkevtp, &lpkevp,
				    PORT_ALLOC_DEFAULT);
			else
				error = port_alloc_event(pnotify.portnfy_port,
				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
				    &lpkevp);
			if (error == 0) {
				port_init_event(lpkevp, (uintptr_t)cbp,
				    (void *)(uintptr_t)pnotify.portnfy_user,
				    aio_port_callback, reqp);
				lpkevp->portkev_events = event;
				reqp->aio_req_portkev = lpkevp;
				reqp->aio_req_port = pnotify.portnfy_port;
			}
		}

		/*
		 * send the request to driver.
		 */
		if (error == 0) {
			if (aiocb->aio_nbytes == 0) {
				clear_active_fd(aiocb->aio_fildes);
				aio_zerolen(reqp);
				continue;
			}
			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
			    CRED());
		}

		/*
		 * the fd's ref count is not decremented until the IO has
		 * completed unless there was an error.
		 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			if (error == ENOTSUP)
				aio_notsupported++;
			else
				aio_errors++;
			lio_set_error(reqp, portused);
		} else {
			clear_active_fd(aiocb->aio_fildes);
		}
	}

	if (aio_notsupported) {
		error = ENOTSUP;
	} else if (aio_errors) {
		/*
		 * return EIO if any request failed
		 */
		error = EIO;
	}

	if (mode_arg == LIO_WAIT) {
		/* head is always allocated above when mode_arg == LIO_WAIT */
		mutex_enter(&aiop->aio_mutex);
		while (head->lio_refcnt > 0) {
			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
				mutex_exit(&aiop->aio_mutex);
				error = EINTR;
				goto done;
			}
		}
		mutex_exit(&aiop->aio_mutex);
		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
	}

done:
	kmem_free(cbplist, ssize);
	if (deadhead) {
		/* no request ever joined the list: free its resources */
		if (head->lio_sigqp)
			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
		if (head->lio_portkev)
			port_free_event(head->lio_portkev);
		kmem_free(head, sizeof (aio_lio_t));
	}
	return (error);
}

#endif /* _LP64 */

/*
 * Asynchronous list IO.
 * If list I/O is called with LIO_WAIT it can still return
 * before all the I/O's are completed if a signal is caught
 * or if the list include UFS I/O requests.
If this happens, 1546 * libaio will call aliowait() to wait for the I/O's to 1547 * complete 1548 */ 1549 /*ARGSUSED*/ 1550 static int 1551 aliowait( 1552 int mode, 1553 void *aiocb, 1554 int nent, 1555 void *sigev, 1556 int run_mode) 1557 { 1558 aio_lio_t *head; 1559 aio_t *aiop; 1560 caddr_t cbplist; 1561 aiocb_t *cbp, **ucbp; 1562 #ifdef _SYSCALL32_IMPL 1563 aiocb32_t *cbp32; 1564 caddr32_t *ucbp32; 1565 aiocb64_32_t *cbp64; 1566 #endif 1567 int error = 0; 1568 int i; 1569 size_t ssize = 0; 1570 model_t model = get_udatamodel(); 1571 1572 aiop = curproc->p_aio; 1573 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 1574 return (EINVAL); 1575 1576 if (model == DATAMODEL_NATIVE) 1577 ssize = (sizeof (aiocb_t *) * nent); 1578 #ifdef _SYSCALL32_IMPL 1579 else 1580 ssize = (sizeof (caddr32_t) * nent); 1581 #endif /* _SYSCALL32_IMPL */ 1582 1583 if (ssize == 0) 1584 return (EINVAL); 1585 1586 cbplist = kmem_alloc(ssize, KM_SLEEP); 1587 1588 if (model == DATAMODEL_NATIVE) 1589 ucbp = (aiocb_t **)cbplist; 1590 #ifdef _SYSCALL32_IMPL 1591 else 1592 ucbp32 = (caddr32_t *)cbplist; 1593 #endif /* _SYSCALL32_IMPL */ 1594 1595 if (copyin(aiocb, cbplist, ssize)) { 1596 error = EFAULT; 1597 goto done; 1598 } 1599 1600 /* 1601 * To find the list head, we go through the 1602 * list of aiocb structs, find the request 1603 * its for, then get the list head that reqp 1604 * points to 1605 */ 1606 head = NULL; 1607 1608 for (i = 0; i < nent; i++) { 1609 if (model == DATAMODEL_NATIVE) { 1610 /* 1611 * Since we are only checking for a NULL pointer 1612 * Following should work on both native data sizes 1613 * as well as for largefile aiocb. 1614 */ 1615 if ((cbp = *ucbp++) == NULL) 1616 continue; 1617 if (run_mode != AIO_LARGEFILE) 1618 if (head = aio_list_get(&cbp->aio_resultp)) 1619 break; 1620 else { 1621 /* 1622 * This is a case when largefile call is 1623 * made on 32 bit kernel. 
1624 * Treat each pointer as pointer to 1625 * aiocb64_32 1626 */ 1627 if (head = aio_list_get((aio_result_t *) 1628 &(((aiocb64_32_t *)cbp)->aio_resultp))) 1629 break; 1630 } 1631 } 1632 #ifdef _SYSCALL32_IMPL 1633 else { 1634 if (run_mode == AIO_LARGEFILE) { 1635 if ((cbp64 = (aiocb64_32_t *) 1636 (uintptr_t)*ucbp32++) == NULL) 1637 continue; 1638 if (head = aio_list_get((aio_result_t *) 1639 &cbp64->aio_resultp)) 1640 break; 1641 } else if (run_mode == AIO_32) { 1642 if ((cbp32 = (aiocb32_t *) 1643 (uintptr_t)*ucbp32++) == NULL) 1644 continue; 1645 if (head = aio_list_get((aio_result_t *) 1646 &cbp32->aio_resultp)) 1647 break; 1648 } 1649 } 1650 #endif /* _SYSCALL32_IMPL */ 1651 } 1652 1653 if (head == NULL) { 1654 error = EINVAL; 1655 goto done; 1656 } 1657 1658 mutex_enter(&aiop->aio_mutex); 1659 while (head->lio_refcnt > 0) { 1660 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 1661 mutex_exit(&aiop->aio_mutex); 1662 error = EINTR; 1663 goto done; 1664 } 1665 } 1666 mutex_exit(&aiop->aio_mutex); 1667 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode); 1668 done: 1669 kmem_free(cbplist, ssize); 1670 return (error); 1671 } 1672 1673 aio_lio_t * 1674 aio_list_get(aio_result_t *resultp) 1675 { 1676 aio_lio_t *head = NULL; 1677 aio_t *aiop; 1678 aio_req_t **bucket; 1679 aio_req_t *reqp; 1680 long index; 1681 1682 aiop = curproc->p_aio; 1683 if (aiop == NULL) 1684 return (NULL); 1685 1686 if (resultp) { 1687 index = AIO_HASH(resultp); 1688 bucket = &aiop->aio_hash[index]; 1689 for (reqp = *bucket; reqp != NULL; 1690 reqp = reqp->aio_hash_next) { 1691 if (reqp->aio_req_resultp == resultp) { 1692 head = reqp->aio_req_lio; 1693 return (head); 1694 } 1695 } 1696 } 1697 return (NULL); 1698 } 1699 1700 1701 static void 1702 lio_set_uerror(void *resultp, int error) 1703 { 1704 /* 1705 * the resultp field is a pointer to where the 1706 * error should be written out to the user's 1707 * aiocb. 
1708 * 1709 */ 1710 if (get_udatamodel() == DATAMODEL_NATIVE) { 1711 (void) sulword(&((aio_result_t *)resultp)->aio_return, 1712 (ssize_t)-1); 1713 (void) suword32(&((aio_result_t *)resultp)->aio_errno, error); 1714 } 1715 #ifdef _SYSCALL32_IMPL 1716 else { 1717 (void) suword32(&((aio_result32_t *)resultp)->aio_return, 1718 (uint_t)-1); 1719 (void) suword32(&((aio_result32_t *)resultp)->aio_errno, error); 1720 } 1721 #endif /* _SYSCALL32_IMPL */ 1722 } 1723 1724 /* 1725 * do cleanup completion for all requests in list. memory for 1726 * each request is also freed. 1727 */ 1728 static void 1729 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode) 1730 { 1731 int i; 1732 aio_req_t *reqp; 1733 aio_result_t *resultp; 1734 aiocb64_32_t *aiocb_64; 1735 1736 for (i = 0; i < nent; i++) { 1737 if (get_udatamodel() == DATAMODEL_NATIVE) { 1738 if (cbp[i] == NULL) 1739 continue; 1740 if (run_mode == AIO_LARGEFILE) { 1741 aiocb_64 = (aiocb64_32_t *)cbp[i]; 1742 resultp = (aio_result_t *) 1743 &aiocb_64->aio_resultp; 1744 } else 1745 resultp = &cbp[i]->aio_resultp; 1746 } 1747 #ifdef _SYSCALL32_IMPL 1748 else { 1749 aiocb32_t *aiocb_32; 1750 caddr32_t *cbp32; 1751 1752 cbp32 = (caddr32_t *)cbp; 1753 if (cbp32[i] == NULL) 1754 continue; 1755 if (run_mode == AIO_32) { 1756 aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i]; 1757 resultp = (aio_result_t *)&aiocb_32-> 1758 aio_resultp; 1759 } else if (run_mode == AIO_LARGEFILE) { 1760 aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i]; 1761 resultp = (aio_result_t *)&aiocb_64-> 1762 aio_resultp; 1763 } 1764 } 1765 #endif /* _SYSCALL32_IMPL */ 1766 /* 1767 * we need to get the aio_cleanupq_mutex since we call 1768 * aio_req_done(). 
1769 */ 1770 mutex_enter(&aiop->aio_cleanupq_mutex); 1771 mutex_enter(&aiop->aio_mutex); 1772 reqp = aio_req_done(resultp); 1773 mutex_exit(&aiop->aio_mutex); 1774 mutex_exit(&aiop->aio_cleanupq_mutex); 1775 if (reqp != NULL) { 1776 aphysio_unlock(reqp); 1777 aio_copyout_result(reqp); 1778 mutex_enter(&aiop->aio_mutex); 1779 aio_req_free(aiop, reqp); 1780 mutex_exit(&aiop->aio_mutex); 1781 } 1782 } 1783 } 1784 1785 /* 1786 * Write out the results for an aio request that is done. 1787 */ 1788 static int 1789 aioerror(void *cb, int run_mode) 1790 { 1791 aio_result_t *resultp; 1792 aio_t *aiop; 1793 aio_req_t *reqp; 1794 int retval; 1795 1796 aiop = curproc->p_aio; 1797 if (aiop == NULL || cb == NULL) 1798 return (EINVAL); 1799 1800 if (get_udatamodel() == DATAMODEL_NATIVE) { 1801 if (run_mode == AIO_LARGEFILE) 1802 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)-> 1803 aio_resultp; 1804 else 1805 resultp = &((aiocb_t *)cb)->aio_resultp; 1806 } 1807 #ifdef _SYSCALL32_IMPL 1808 else { 1809 if (run_mode == AIO_LARGEFILE) 1810 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)-> 1811 aio_resultp; 1812 else if (run_mode == AIO_32) 1813 resultp = (aio_result_t *)&((aiocb32_t *)cb)-> 1814 aio_resultp; 1815 } 1816 #endif /* _SYSCALL32_IMPL */ 1817 /* 1818 * we need to get the aio_cleanupq_mutex since we call 1819 * aio_req_find(). 
1820 */ 1821 mutex_enter(&aiop->aio_cleanupq_mutex); 1822 mutex_enter(&aiop->aio_mutex); 1823 retval = aio_req_find(resultp, &reqp); 1824 mutex_exit(&aiop->aio_mutex); 1825 mutex_exit(&aiop->aio_cleanupq_mutex); 1826 if (retval == 0) { 1827 aphysio_unlock(reqp); 1828 aio_copyout_result(reqp); 1829 mutex_enter(&aiop->aio_mutex); 1830 aio_req_free(aiop, reqp); 1831 mutex_exit(&aiop->aio_mutex); 1832 return (0); 1833 } else if (retval == 1) 1834 return (EINPROGRESS); 1835 else if (retval == 2) 1836 return (EINVAL); 1837 return (0); 1838 } 1839 1840 /* 1841 * aio_cancel - if no requests outstanding, 1842 * return AIO_ALLDONE 1843 * else 1844 * return AIO_NOTCANCELED 1845 */ 1846 static int 1847 aio_cancel( 1848 int fildes, 1849 void *cb, 1850 long *rval, 1851 int run_mode) 1852 { 1853 aio_t *aiop; 1854 void *resultp; 1855 int index; 1856 aio_req_t **bucket; 1857 aio_req_t *ent; 1858 1859 1860 /* 1861 * Verify valid file descriptor 1862 */ 1863 if ((getf(fildes)) == NULL) { 1864 return (EBADF); 1865 } 1866 releasef(fildes); 1867 1868 aiop = curproc->p_aio; 1869 if (aiop == NULL) 1870 return (EINVAL); 1871 1872 if (aiop->aio_outstanding == 0) { 1873 *rval = AIO_ALLDONE; 1874 return (0); 1875 } 1876 1877 mutex_enter(&aiop->aio_mutex); 1878 if (cb != NULL) { 1879 if (get_udatamodel() == DATAMODEL_NATIVE) { 1880 if (run_mode == AIO_LARGEFILE) 1881 resultp = (aio_result_t *)&((aiocb64_32_t *)cb) 1882 ->aio_resultp; 1883 else 1884 resultp = &((aiocb_t *)cb)->aio_resultp; 1885 } 1886 #ifdef _SYSCALL32_IMPL 1887 else { 1888 if (run_mode == AIO_LARGEFILE) 1889 resultp = (aio_result_t *)&((aiocb64_32_t *)cb) 1890 ->aio_resultp; 1891 else if (run_mode == AIO_32) 1892 resultp = (aio_result_t *)&((aiocb32_t *)cb) 1893 ->aio_resultp; 1894 } 1895 #endif /* _SYSCALL32_IMPL */ 1896 index = AIO_HASH(resultp); 1897 bucket = &aiop->aio_hash[index]; 1898 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 1899 if (ent->aio_req_resultp == resultp) { 1900 if ((ent->aio_req_flags & 
AIO_PENDING) == 0) { 1901 mutex_exit(&aiop->aio_mutex); 1902 *rval = AIO_ALLDONE; 1903 return (0); 1904 } 1905 mutex_exit(&aiop->aio_mutex); 1906 *rval = AIO_NOTCANCELED; 1907 return (0); 1908 } 1909 } 1910 mutex_exit(&aiop->aio_mutex); 1911 *rval = AIO_ALLDONE; 1912 return (0); 1913 } 1914 1915 for (index = 0; index < AIO_HASHSZ; index++) { 1916 bucket = &aiop->aio_hash[index]; 1917 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 1918 if (ent->aio_req_fd == fildes) { 1919 if ((ent->aio_req_flags & AIO_PENDING) != 0) { 1920 mutex_exit(&aiop->aio_mutex); 1921 *rval = AIO_NOTCANCELED; 1922 return (0); 1923 } 1924 } 1925 } 1926 } 1927 mutex_exit(&aiop->aio_mutex); 1928 *rval = AIO_ALLDONE; 1929 return (0); 1930 } 1931 1932 /* 1933 * solaris version of asynchronous read and write 1934 */ 1935 static int 1936 arw( 1937 int opcode, 1938 int fdes, 1939 char *bufp, 1940 int bufsize, 1941 offset_t offset, 1942 aio_result_t *resultp, 1943 int mode) 1944 { 1945 file_t *fp; 1946 int error; 1947 struct vnode *vp; 1948 aio_req_t *reqp; 1949 aio_t *aiop; 1950 int (*aio_func)(); 1951 #ifdef _LP64 1952 aiocb_t aiocb; 1953 #else 1954 aiocb64_32_t aiocb64; 1955 #endif 1956 1957 aiop = curproc->p_aio; 1958 if (aiop == NULL) 1959 return (EINVAL); 1960 1961 if ((fp = getf(fdes)) == NULL) { 1962 return (EBADF); 1963 } 1964 1965 /* 1966 * check the permission of the partition 1967 */ 1968 if ((fp->f_flag & mode) == 0) { 1969 releasef(fdes); 1970 return (EBADF); 1971 } 1972 1973 vp = fp->f_vnode; 1974 aio_func = check_vp(vp, mode); 1975 if (aio_func == NULL) { 1976 releasef(fdes); 1977 return (EBADFD); 1978 } 1979 #ifdef _LP64 1980 aiocb.aio_fildes = fdes; 1981 aiocb.aio_buf = bufp; 1982 aiocb.aio_nbytes = bufsize; 1983 aiocb.aio_offset = offset; 1984 aiocb.aio_sigevent.sigev_notify = 0; 1985 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1); 1986 #else 1987 aiocb64.aio_fildes = fdes; 1988 aiocb64.aio_buf = (caddr32_t)bufp; 1989 aiocb64.aio_nbytes = bufsize; 1990 
aiocb64.aio_offset = offset; 1991 aiocb64.aio_sigevent.sigev_notify = 0; 1992 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1); 1993 #endif 1994 if (error) { 1995 releasef(fdes); 1996 return (error); 1997 } 1998 1999 /* 2000 * enable polling on this request if the opcode has 2001 * the AIO poll bit set 2002 */ 2003 if (opcode & AIO_POLL_BIT) 2004 reqp->aio_req_flags |= AIO_POLL; 2005 2006 if (bufsize == 0) { 2007 clear_active_fd(fdes); 2008 aio_zerolen(reqp); 2009 return (0); 2010 } 2011 /* 2012 * send the request to driver. 2013 */ 2014 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED()); 2015 /* 2016 * the fd is stored in the aio_req_t by aio_req_setup(), and 2017 * is released by the aio_cleanup_thread() when the IO has 2018 * completed. 2019 */ 2020 if (error) { 2021 releasef(fdes); 2022 mutex_enter(&aiop->aio_mutex); 2023 aio_req_free(aiop, reqp); 2024 aiop->aio_pending--; 2025 if (aiop->aio_flags & AIO_REQ_BLOCK) 2026 cv_signal(&aiop->aio_cleanupcv); 2027 mutex_exit(&aiop->aio_mutex); 2028 return (error); 2029 } 2030 clear_active_fd(fdes); 2031 return (0); 2032 } 2033 2034 /* 2035 * posix version of asynchronous read and write 2036 */ 2037 static int 2038 aiorw( 2039 int opcode, 2040 void *aiocb_arg, 2041 int mode, 2042 int run_mode) 2043 { 2044 #ifdef _SYSCALL32_IMPL 2045 aiocb32_t aiocb32; 2046 struct sigevent32 *sigev32; 2047 port_notify32_t pntfy32; 2048 #endif 2049 aiocb64_32_t aiocb64; 2050 aiocb_t aiocb; 2051 file_t *fp; 2052 int error, fd; 2053 size_t bufsize; 2054 struct vnode *vp; 2055 aio_req_t *reqp; 2056 aio_t *aiop; 2057 int (*aio_func)(); 2058 aio_result_t *resultp; 2059 struct sigevent *sigev; 2060 model_t model; 2061 int aio_use_port = 0; 2062 port_notify_t pntfy; 2063 2064 model = get_udatamodel(); 2065 aiop = curproc->p_aio; 2066 if (aiop == NULL) 2067 return (EINVAL); 2068 2069 if (model == DATAMODEL_NATIVE) { 2070 if (run_mode != AIO_LARGEFILE) { 2071 if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t))) 2072 return 
(EFAULT); 2073 bufsize = aiocb.aio_nbytes; 2074 resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp); 2075 if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) { 2076 return (EBADF); 2077 } 2078 sigev = &aiocb.aio_sigevent; 2079 } else { 2080 /* 2081 * We come here only when we make largefile 2082 * call on 32 bit kernel using 32 bit library. 2083 */ 2084 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t))) 2085 return (EFAULT); 2086 bufsize = aiocb64.aio_nbytes; 2087 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg) 2088 ->aio_resultp); 2089 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) 2090 return (EBADF); 2091 sigev = (struct sigevent *)&aiocb64.aio_sigevent; 2092 } 2093 2094 if (sigev->sigev_notify == SIGEV_PORT) { 2095 if (copyin((void *)sigev->sigev_value.sival_ptr, 2096 &pntfy, sizeof (port_notify_t))) { 2097 releasef(fd); 2098 return (EFAULT); 2099 } 2100 aio_use_port = 1; 2101 } else if (sigev->sigev_notify == SIGEV_THREAD) { 2102 pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo; 2103 pntfy.portnfy_user = 2104 aiocb.aio_sigevent.sigev_value.sival_ptr; 2105 aio_use_port = 1; 2106 } 2107 } 2108 #ifdef _SYSCALL32_IMPL 2109 else { 2110 if (run_mode == AIO_32) { 2111 /* 32 bit system call is being made on 64 bit kernel */ 2112 if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t))) 2113 return (EFAULT); 2114 2115 bufsize = aiocb32.aio_nbytes; 2116 aiocb_32ton(&aiocb32, &aiocb); 2117 resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)-> 2118 aio_resultp); 2119 if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) { 2120 return (EBADF); 2121 } 2122 sigev32 = &aiocb32.aio_sigevent; 2123 } else if (run_mode == AIO_LARGEFILE) { 2124 /* 2125 * We come here only when we make largefile 2126 * call on 64 bit kernel using 32 bit library. 
2127 */ 2128 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t))) 2129 return (EFAULT); 2130 bufsize = aiocb64.aio_nbytes; 2131 aiocb_LFton(&aiocb64, &aiocb); 2132 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg) 2133 ->aio_resultp); 2134 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) 2135 return (EBADF); 2136 sigev32 = &aiocb64.aio_sigevent; 2137 } 2138 2139 if (sigev32->sigev_notify == SIGEV_PORT) { 2140 if (copyin( 2141 (void *)(uintptr_t)sigev32->sigev_value.sival_ptr, 2142 &pntfy32, sizeof (port_notify32_t))) { 2143 releasef(fd); 2144 return (EFAULT); 2145 } 2146 pntfy.portnfy_port = pntfy32.portnfy_port; 2147 pntfy.portnfy_user = (void *)(uintptr_t) 2148 pntfy32.portnfy_user; 2149 aio_use_port = 1; 2150 } else if (sigev32->sigev_notify == SIGEV_THREAD) { 2151 pntfy.portnfy_port = sigev32->sigev_signo; 2152 pntfy.portnfy_user = (void *)(uintptr_t) 2153 sigev32->sigev_value.sival_ptr; 2154 aio_use_port = 1; 2155 } 2156 } 2157 #endif /* _SYSCALL32_IMPL */ 2158 2159 /* 2160 * check the permission of the partition 2161 */ 2162 2163 if ((fp->f_flag & mode) == 0) { 2164 releasef(fd); 2165 return (EBADF); 2166 } 2167 2168 vp = fp->f_vnode; 2169 aio_func = check_vp(vp, mode); 2170 if (aio_func == NULL) { 2171 releasef(fd); 2172 return (EBADFD); 2173 } 2174 if (run_mode == AIO_LARGEFILE) 2175 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0); 2176 else 2177 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0); 2178 2179 if (error) { 2180 releasef(fd); 2181 return (error); 2182 } 2183 /* 2184 * enable polling on this request if the opcode has 2185 * the AIO poll bit set 2186 */ 2187 if (opcode & AIO_POLL_BIT) 2188 reqp->aio_req_flags |= AIO_POLL; 2189 2190 if (model == DATAMODEL_NATIVE) 2191 reqp->aio_req_iocb.iocb = aiocb_arg; 2192 #ifdef _SYSCALL32_IMPL 2193 else 2194 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg; 2195 #endif 2196 2197 if (aio_use_port) { 2198 int event = (run_mode == AIO_LARGEFILE)? 2199 ((mode == FREAD)? 
AIOAREAD64 : AIOAWRITE64) : 2200 ((mode == FREAD)? AIOAREAD : AIOAWRITE); 2201 error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event); 2202 } 2203 2204 /* 2205 * send the request to driver. 2206 */ 2207 if (error == 0) { 2208 if (bufsize == 0) { 2209 clear_active_fd(fd); 2210 aio_zerolen(reqp); 2211 return (0); 2212 } 2213 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED()); 2214 } 2215 2216 /* 2217 * the fd is stored in the aio_req_t by aio_req_setup(), and 2218 * is released by the aio_cleanup_thread() when the IO has 2219 * completed. 2220 */ 2221 if (error) { 2222 releasef(fd); 2223 mutex_enter(&aiop->aio_mutex); 2224 if (aio_use_port) 2225 aio_deq(&aiop->aio_portpending, reqp); 2226 aio_req_free(aiop, reqp); 2227 aiop->aio_pending--; 2228 if (aiop->aio_flags & AIO_REQ_BLOCK) 2229 cv_signal(&aiop->aio_cleanupcv); 2230 mutex_exit(&aiop->aio_mutex); 2231 return (error); 2232 } 2233 clear_active_fd(fd); 2234 return (0); 2235 } 2236 2237 2238 /* 2239 * set error for a list IO entry that failed. 2240 */ 2241 static void 2242 lio_set_error(aio_req_t *reqp, int portused) 2243 { 2244 aio_t *aiop = curproc->p_aio; 2245 2246 if (aiop == NULL) 2247 return; 2248 2249 mutex_enter(&aiop->aio_mutex); 2250 if (portused) 2251 aio_deq(&aiop->aio_portpending, reqp); 2252 aiop->aio_pending--; 2253 /* request failed, AIO_PHYSIODONE set to aviod physio cleanup. */ 2254 reqp->aio_req_flags |= AIO_PHYSIODONE; 2255 /* 2256 * Need to free the request now as its never 2257 * going to get on the done queue 2258 * 2259 * Note: aio_outstanding is decremented in 2260 * aio_req_free() 2261 */ 2262 aio_req_free(aiop, reqp); 2263 if (aiop->aio_flags & AIO_REQ_BLOCK) 2264 cv_signal(&aiop->aio_cleanupcv); 2265 mutex_exit(&aiop->aio_mutex); 2266 } 2267 2268 /* 2269 * check if a specified request is done, and remove it from 2270 * the done queue. otherwise remove anybody from the done queue 2271 * if NULL is specified. 
2272 */ 2273 static aio_req_t * 2274 aio_req_done(void *resultp) 2275 { 2276 aio_req_t **bucket; 2277 aio_req_t *ent; 2278 aio_t *aiop = curproc->p_aio; 2279 long index; 2280 2281 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex)); 2282 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2283 2284 if (resultp) { 2285 index = AIO_HASH(resultp); 2286 bucket = &aiop->aio_hash[index]; 2287 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 2288 if (ent->aio_req_resultp == (aio_result_t *)resultp) { 2289 if (ent->aio_req_flags & AIO_DONEQ) { 2290 return (aio_req_remove(ent)); 2291 } 2292 return (NULL); 2293 } 2294 } 2295 /* no match, resultp is invalid */ 2296 return (NULL); 2297 } 2298 return (aio_req_remove(NULL)); 2299 } 2300 2301 /* 2302 * determine if a user-level resultp pointer is associated with an 2303 * active IO request. Zero is returned when the request is done, 2304 * and the request is removed from the done queue. Only when the 2305 * return value is zero, is the "reqp" pointer valid. One is returned 2306 * when the request is inprogress. Two is returned when the request 2307 * is invalid. 2308 */ 2309 static int 2310 aio_req_find(aio_result_t *resultp, aio_req_t **reqp) 2311 { 2312 aio_req_t **bucket; 2313 aio_req_t *ent; 2314 aio_t *aiop = curproc->p_aio; 2315 long index; 2316 2317 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex)); 2318 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2319 2320 index = AIO_HASH(resultp); 2321 bucket = &aiop->aio_hash[index]; 2322 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 2323 if (ent->aio_req_resultp == resultp) { 2324 if (ent->aio_req_flags & AIO_DONEQ) { 2325 *reqp = aio_req_remove(ent); 2326 return (0); 2327 } 2328 return (1); 2329 } 2330 } 2331 /* no match, resultp is invalid */ 2332 return (2); 2333 } 2334 2335 /* 2336 * remove a request from the done queue. 
2337 */ 2338 static aio_req_t * 2339 aio_req_remove(aio_req_t *reqp) 2340 { 2341 aio_t *aiop = curproc->p_aio; 2342 2343 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2344 2345 if (reqp != NULL) { 2346 ASSERT(reqp->aio_req_flags & AIO_DONEQ); 2347 if (reqp->aio_req_next == reqp) { 2348 /* only one request on queue */ 2349 if (reqp == aiop->aio_doneq) { 2350 aiop->aio_doneq = NULL; 2351 } else { 2352 ASSERT(reqp == aiop->aio_cleanupq); 2353 aiop->aio_cleanupq = NULL; 2354 } 2355 } else { 2356 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2357 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2358 /* 2359 * The request can be either on the aio_doneq or the 2360 * aio_cleanupq 2361 */ 2362 if (reqp == aiop->aio_doneq) 2363 aiop->aio_doneq = reqp->aio_req_next; 2364 2365 if (reqp == aiop->aio_cleanupq) 2366 aiop->aio_cleanupq = reqp->aio_req_next; 2367 } 2368 reqp->aio_req_flags &= ~AIO_DONEQ; 2369 reqp->aio_req_next = NULL; 2370 reqp->aio_req_prev = NULL; 2371 } else if ((reqp = aiop->aio_doneq) != NULL) { 2372 ASSERT(reqp->aio_req_flags & AIO_DONEQ); 2373 if (reqp == reqp->aio_req_next) { 2374 /* only one request on queue */ 2375 aiop->aio_doneq = NULL; 2376 } else { 2377 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2378 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2379 aiop->aio_doneq = reqp->aio_req_next; 2380 } 2381 reqp->aio_req_flags &= ~AIO_DONEQ; 2382 reqp->aio_req_next = NULL; 2383 reqp->aio_req_prev = NULL; 2384 } 2385 if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN)) 2386 cv_broadcast(&aiop->aio_waitcv); 2387 return (reqp); 2388 } 2389 2390 static int 2391 aio_req_setup( 2392 aio_req_t **reqpp, 2393 aio_t *aiop, 2394 aiocb_t *arg, 2395 aio_result_t *resultp, 2396 vnode_t *vp, 2397 int old_solaris_req) 2398 { 2399 sigqueue_t *sqp = NULL; 2400 aio_req_t *reqp; 2401 struct uio *uio; 2402 struct sigevent *sigev; 2403 int error; 2404 2405 sigev = &arg->aio_sigevent; 2406 if (sigev->sigev_notify == SIGEV_SIGNAL && 2407 
sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) { 2408 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 2409 if (sqp == NULL) 2410 return (EAGAIN); 2411 sqp->sq_func = NULL; 2412 sqp->sq_next = NULL; 2413 sqp->sq_info.si_code = SI_ASYNCIO; 2414 sqp->sq_info.si_pid = curproc->p_pid; 2415 sqp->sq_info.si_ctid = PRCTID(curproc); 2416 sqp->sq_info.si_zoneid = getzoneid(); 2417 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 2418 sqp->sq_info.si_signo = sigev->sigev_signo; 2419 sqp->sq_info.si_value = sigev->sigev_value; 2420 } 2421 2422 mutex_enter(&aiop->aio_mutex); 2423 2424 if (aiop->aio_flags & AIO_REQ_BLOCK) { 2425 mutex_exit(&aiop->aio_mutex); 2426 if (sqp) 2427 kmem_free(sqp, sizeof (sigqueue_t)); 2428 return (EIO); 2429 } 2430 /* 2431 * get an aio_reqp from the free list or allocate one 2432 * from dynamic memory. 2433 */ 2434 if (error = aio_req_alloc(&reqp, resultp)) { 2435 mutex_exit(&aiop->aio_mutex); 2436 if (sqp) 2437 kmem_free(sqp, sizeof (sigqueue_t)); 2438 return (error); 2439 } 2440 aiop->aio_pending++; 2441 aiop->aio_outstanding++; 2442 reqp->aio_req_flags = AIO_PENDING; 2443 if (old_solaris_req) { 2444 /* this is an old solaris aio request */ 2445 reqp->aio_req_flags |= AIO_SOLARIS; 2446 aiop->aio_flags |= AIO_SOLARIS_REQ; 2447 } 2448 if (sigev->sigev_notify == SIGEV_THREAD || 2449 sigev->sigev_notify == SIGEV_PORT) 2450 aio_enq(&aiop->aio_portpending, reqp, 0); 2451 mutex_exit(&aiop->aio_mutex); 2452 /* 2453 * initialize aio request. 2454 */ 2455 reqp->aio_req_fd = arg->aio_fildes; 2456 reqp->aio_req_sigqp = sqp; 2457 reqp->aio_req_iocb.iocb = NULL; 2458 reqp->aio_req_lio = NULL; 2459 reqp->aio_req_buf.b_file = vp; 2460 uio = reqp->aio_req.aio_uio; 2461 uio->uio_iovcnt = 1; 2462 uio->uio_iov->iov_base = (caddr_t)arg->aio_buf; 2463 uio->uio_iov->iov_len = arg->aio_nbytes; 2464 uio->uio_loffset = arg->aio_offset; 2465 *reqpp = reqp; 2466 return (0); 2467 } 2468 2469 /* 2470 * Allocate p_aio struct. 
2471 */ 2472 static aio_t * 2473 aio_aiop_alloc(void) 2474 { 2475 aio_t *aiop; 2476 2477 ASSERT(MUTEX_HELD(&curproc->p_lock)); 2478 2479 aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP); 2480 if (aiop) { 2481 mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL); 2482 mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT, 2483 NULL); 2484 mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL); 2485 } 2486 return (aiop); 2487 } 2488 2489 /* 2490 * Allocate an aio_req struct. 2491 */ 2492 static int 2493 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp) 2494 { 2495 aio_req_t *reqp; 2496 aio_t *aiop = curproc->p_aio; 2497 2498 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2499 2500 if ((reqp = aiop->aio_free) != NULL) { 2501 aiop->aio_free = reqp->aio_req_next; 2502 bzero(reqp, sizeof (*reqp)); 2503 } else { 2504 /* 2505 * Check whether memory is getting tight. 2506 * This is a temporary mechanism to avoid memory 2507 * exhaustion by a single process until we come up 2508 * with a per process solution such as setrlimit(). 2509 */ 2510 if (freemem < desfree) 2511 return (EAGAIN); 2512 reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP); 2513 if (reqp == NULL) 2514 return (EAGAIN); 2515 } 2516 reqp->aio_req.aio_uio = &reqp->aio_req_uio; 2517 reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov; 2518 reqp->aio_req.aio_private = reqp; 2519 reqp->aio_req_buf.b_offset = -1; 2520 reqp->aio_req_resultp = resultp; 2521 if (aio_hash_insert(reqp, aiop)) { 2522 reqp->aio_req_next = aiop->aio_free; 2523 aiop->aio_free = reqp; 2524 return (EBUSY); 2525 } 2526 *nreqp = reqp; 2527 return (0); 2528 } 2529 2530 /* 2531 * Allocate an aio_lio_t struct. 
 */
static int
aio_lio_alloc(aio_lio_t **head)
{
	aio_lio_t *liop;
	aio_t *aiop = curproc->p_aio;

	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	/* recycle a list head from the per-process free list if possible */
	if ((liop = aiop->aio_lio_free) != NULL) {
		aiop->aio_lio_free = liop->lio_next;
	} else {
		/*
		 * Check whether memory is getting tight.
		 * This is a temporary mechanism to avoid memory
		 * exhaustion by a single process until we come up
		 * with a per process solution such as setrlimit().
		 */
		if (freemem < desfree)
			return (EAGAIN);

		liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
		if (liop == NULL)
			return (EAGAIN);
	}
	*head = liop;
	return (0);
}

/*
 * this is a special per-process thread that is only activated if
 * the process is unmapping a segment with outstanding aio. normally,
 * the process will have completed the aio before unmapping the
 * segment. If the process does unmap a segment with outstanding aio,
 * this special thread will guarantee that the locked pages due to
 * aphysio() are released, thereby permitting the segment to be
 * unmapped. In addition to this, the cleanup thread is woken up
 * during DR operations to release the locked pages.
 */

static int
aio_cleanup_thread(aio_t *aiop)
{
	proc_t *p = curproc;
	struct as *as = p->p_as;
	int poked = 0;		/* set when cv_wait_sig() was interrupted */
	kcondvar_t *cvp;
	int exit_flag = 0;	/* safe to exit: completed I/O pages unlocked */
	int rqclnup = 0;	/* a DR-style cleanup request was seen */

	/* hold all holdable signals; only cantmask signals stay deliverable */
	sigfillset(&curthread->t_hold);
	sigdiffset(&curthread->t_hold, &cantmask);
	for (;;) {
		/*
		 * if a segment is being unmapped, and the current
		 * process's done queue is not empty, then every request
		 * on the doneq with locked resources should be forced
		 * to release their locks. By moving the doneq request
		 * to the cleanupq, aio_cleanup() will process the cleanupq,
		 * and place requests back onto the doneq. All requests
		 * processed by aio_cleanup() will have their physical
		 * resources unlocked.
		 */
		mutex_enter(&aiop->aio_mutex);
		if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
			aiop->aio_flags |= AIO_CLEANUP;
			/* latch any pending DR cleanup request */
			mutex_enter(&as->a_contents);
			if (aiop->aio_rqclnup) {
				aiop->aio_rqclnup = 0;
				rqclnup = 1;
			}
			mutex_exit(&as->a_contents);
			if (aiop->aio_doneq) {
				aio_req_t *doneqhead = aiop->aio_doneq;
				aiop->aio_doneq = NULL;
				aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
			}
		}
		mutex_exit(&aiop->aio_mutex);
		aio_cleanup(AIO_CLEANUP_THREAD);
		/*
		 * thread should block on the cleanupcv while
		 * AIO_CLEANUP is set.
		 */
		cvp = &aiop->aio_cleanupcv;
		mutex_enter(&aiop->aio_mutex);

		/* more work queued meanwhile: loop and clean again */
		if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
		    aiop->aio_notifyq != NULL ||
		    aiop->aio_portcleanupq != NULL) {
			mutex_exit(&aiop->aio_mutex);
			continue;
		}
		mutex_enter(&as->a_contents);

		/*
		 * AIO_CLEANUP determines when the cleanup thread
		 * should be active. This flag is set when
		 * the cleanup thread is awakened by as_unmap() or
		 * due to DR operations.
		 * The flag is cleared when the blocking as_unmap()
		 * that originally awakened us is allowed to
		 * complete. as_unmap() blocks when trying to
		 * unmap a segment that has SOFTLOCKed pages. when
		 * the segment's pages are all SOFTUNLOCKed,
		 * as->a_flags & AS_UNMAPWAIT should be zero.
		 *
		 * In case of cleanup request by DR, the flag is cleared
		 * once all the pending aio requests have been processed.
		 *
		 * The flag shouldn't be cleared right away if the
		 * cleanup thread was interrupted because the process
		 * is doing forkall(). This happens when cv_wait_sig()
		 * returns zero, because it was awakened by a pokelwps().
		 * If the process is not exiting, it must be doing forkall().
		 */
		if ((poked == 0) &&
		    ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
		    (aiop->aio_pending == 0))) {
			aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
			cvp = &as->a_cv;
			rqclnup = 0;
		}
		mutex_exit(&aiop->aio_mutex);
		if (poked) {
			/*
			 * If the process is exiting/killed, don't return
			 * immediately without waiting for pending I/O's
			 * and releasing the page locks.
			 */
			if (p->p_flag & (SEXITLWPS|SKILLED)) {
				/*
				 * If exit_flag is set, then it is
				 * safe to exit because we have released
				 * page locks of completed I/O's.
				 */
				if (exit_flag)
					break;

				mutex_exit(&as->a_contents);

				/*
				 * Wait for all the pending aio to complete.
				 */
				mutex_enter(&aiop->aio_mutex);
				aiop->aio_flags |= AIO_REQ_BLOCK;
				while (aiop->aio_pending != 0)
					cv_wait(&aiop->aio_cleanupcv,
					    &aiop->aio_mutex);
				mutex_exit(&aiop->aio_mutex);
				exit_flag = 1;
				continue;
			} else if (p->p_flag &
			    (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
				/*
				 * hold LWP until it
				 * is continued.
				 */
				mutex_exit(&as->a_contents);
				mutex_enter(&p->p_lock);
				stop(PR_SUSPENDED, SUSPEND_NORMAL);
				mutex_exit(&p->p_lock);
				poked = 0;
				continue;
			}
		} else {
			/*
			 * When started this thread will sleep on as->a_cv.
			 * as_unmap will awake this thread if the
			 * segment has SOFTLOCKed pages (poked = 0).
			 * 1. pokelwps() awakes this thread =>
			 *    break the loop to check SEXITLWPS, SHOLDFORK, etc
			 * 2. as_unmap awakes this thread =>
			 *    to break the loop it is necessary that
			 *    - AS_UNMAPWAIT is set (as_unmap is waiting for
			 *	memory to be unlocked)
			 *    - AIO_CLEANUP is not set
			 *	(if AIO_CLEANUP is set we have to wait for
			 *	pending requests. aio_done will send a signal
			 *	for every request which completes to continue
			 *	unmapping the corresponding address range)
			 * 3. A cleanup request will wake this thread up, ex.
			 *    by the DR operations. The aio_rqclnup flag will
			 *    be set.
			 */
			while (poked == 0) {
				/*
				 * The clean up requests that came in
				 * after we had just cleaned up, couldn't
				 * be causing the unmap thread to block - as
				 * unmap event happened first.
				 * Let aio_done() wake us up if it sees a need.
				 */
				if (aiop->aio_rqclnup &&
				    (aiop->aio_flags & AIO_CLEANUP) == 0)
					break;
				poked = !cv_wait_sig(cvp, &as->a_contents);
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_signal(cvp);
				if (aiop->aio_outstanding != 0)
					break;
			}
		}
		mutex_exit(&as->a_contents);
	}
exit:
	/* reached via break while still holding as->a_contents */
	mutex_exit(&as->a_contents);
	ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
	aston(curthread);	/* make thread do post_syscall */
	return (0);
}

/*
 * save a reference to a user's outstanding aio in a hash list.
 * Returns DUPLICATE if another outstanding request already uses
 * the same aio_result_t address; 0 otherwise.
 */
static int
aio_hash_insert(
	aio_req_t *aio_reqp,
	aio_t *aiop)
{
	long index;
	aio_result_t *resultp = aio_reqp->aio_req_resultp;
	aio_req_t *current;
	aio_req_t **nextp;

	index = AIO_HASH(resultp);
	nextp = &aiop->aio_hash[index];
	while ((current = *nextp) != NULL) {
		if (current->aio_req_resultp == resultp)
			return (DUPLICATE);
		nextp = &current->aio_hash_next;
	}
	/* append at the tail of the hash chain */
	*nextp = aio_reqp;
	aio_reqp->aio_hash_next = NULL;
	return (0);
}

/*
 * Decide whether kaio can service I/O for vp and return the function
 * to issue it with, or NULL to fall back to user-level (libaio) aio.
 */
static int
(*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
	cred_t *)
{
	struct snode *sp;
	dev_t dev;
	struct cb_ops *cb;
	major_t major;
	int (*aio_func)();

	dev = vp->v_rdev;
	major = getmajor(dev);

	/*
	 * return NULL for requests to files and STREAMs so
	 * that libaio takes care of them.
	 */
	if (vp->v_type == VCHR) {
		/* no stream device for kaio */
		if (STREAMSTAB(major)) {
			return (NULL);
		}
	} else {
		return (NULL);
	}

	/*
	 * Check old drivers which do not have async I/O entry points.
	 */
	if (devopsp[major]->devo_rev < 3)
		return (NULL);

	cb = devopsp[major]->devo_cb_ops;

	if (cb->cb_rev < 1)
		return (NULL);

	/*
	 * Check whether this device is a block device.
	 * Kaio is not supported for devices like tty.
	 */
	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
		return (NULL);

	/*
	 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
	 * We cannot call the driver directly. Instead return the
	 * PXFS functions.
	 */

	if (IS_PXFSVP(vp)) {
		if (mode & FREAD)
			return (clpxfs_aio_read);
		else
			return (clpxfs_aio_write);
	}
	if (mode & FREAD)
		aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
	else
		aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;

	/*
	 * Do we need this ?
	 * nodev returns ENXIO anyway.
	 */
	/*
	 * NOTE(review): aio_func is only ever NULL or one of the
	 * driver_aio_* wrappers here, so this nodev check looks dead -
	 * confirm before removing.
	 */
	if (aio_func == nodev)
		return (NULL);

	/* mark the snode access time before handing back the entry point */
	sp = VTOS(vp);
	smark(sp, SACC);
	return (aio_func);
}

/*
 * Clustering: We want check_vp to return a function prototyped
 * correctly that will be common to both PXFS and regular case.
 * We define this intermediate function that will do the right
 * thing for driver cases.
2847 */ 2848 2849 static int 2850 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p) 2851 { 2852 dev_t dev; 2853 struct cb_ops *cb; 2854 2855 ASSERT(vp->v_type == VCHR); 2856 ASSERT(!IS_PXFSVP(vp)); 2857 dev = VTOS(vp)->s_dev; 2858 ASSERT(STREAMSTAB(getmajor(dev)) == NULL); 2859 2860 cb = devopsp[getmajor(dev)]->devo_cb_ops; 2861 2862 ASSERT(cb->cb_awrite != nodev); 2863 return ((*cb->cb_awrite)(dev, aio, cred_p)); 2864 } 2865 2866 /* 2867 * Clustering: We want check_vp to return a function prototyped 2868 * correctly that will be common to both PXFS and regular case. 2869 * We define this intermediate function that will do the right 2870 * thing for driver cases. 2871 */ 2872 2873 static int 2874 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p) 2875 { 2876 dev_t dev; 2877 struct cb_ops *cb; 2878 2879 ASSERT(vp->v_type == VCHR); 2880 ASSERT(!IS_PXFSVP(vp)); 2881 dev = VTOS(vp)->s_dev; 2882 ASSERT(!STREAMSTAB(getmajor(dev))); 2883 2884 cb = devopsp[getmajor(dev)]->devo_cb_ops; 2885 2886 ASSERT(cb->cb_aread != nodev); 2887 return ((*cb->cb_aread)(dev, aio, cred_p)); 2888 } 2889 2890 /* 2891 * This routine is called when a largefile call is made by a 32bit 2892 * process on a ILP32 or LP64 kernel. All 64bit processes are large 2893 * file by definition and will call alio() instead. 
 */
static int
alioLF(
	int		mode_arg,
	void		*aiocb_arg,
	int		nent,
	void		*sigev)
{
	file_t		*fp;
	file_t		*prev_fp = NULL;
	int		prev_mode = -1;
	struct vnode	*vp;
	aio_lio_t	*head;
	aio_req_t	*reqp;
	aio_t		*aiop;
	caddr_t		cbplist;
	aiocb64_32_t	cb64;
	aiocb64_32_t	*aiocb = &cb64;
	aiocb64_32_t	*cbp;
	caddr32_t	*ucbp;
#ifdef _LP64
	aiocb_t		aiocb_n;
#endif
	struct sigevent32	sigevk;
	sigqueue_t	*sqp;
	int		(*aio_func)();
	int		mode;
	int		error = 0;
	int		aio_errors = 0;
	int		i;
	size_t		ssize;
	int		deadhead = 0;	/* head allocated but no req attached */
	int		aio_notsupported = 0;
	int		lio_head_port;
	int		aio_port;
	int		aio_thread;
	port_kevent_t	*pkevtp = NULL;
	int		portused = 0;
	port_notify32_t	pnotify;
	int		event;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	ASSERT(get_udatamodel() == DATAMODEL_ILP32);

	/* copy in the user's array of 32-bit aiocb pointers */
	ssize = (sizeof (caddr32_t) * nent);
	cbplist = kmem_alloc(ssize, KM_SLEEP);
	ucbp = (caddr32_t *)cbplist;

	if (copyin(aiocb_arg, cbplist, ssize) ||
	    (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
		kmem_free(cbplist, ssize);
		return (EFAULT);
	}

	/* Event Ports */
	if (sigev &&
	    (sigevk.sigev_notify == SIGEV_THREAD ||
	    sigevk.sigev_notify == SIGEV_PORT)) {
		if (sigevk.sigev_notify == SIGEV_THREAD) {
			pnotify.portnfy_port = sigevk.sigev_signo;
			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
		} else if (copyin(
		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
		    &pnotify, sizeof (pnotify))) {
			kmem_free(cbplist, ssize);
			return (EFAULT);
		}
		error = port_alloc_event(pnotify.portnfy_port,
		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
		if (error) {
			if (error == ENOMEM || error == EAGAIN)
				error = EAGAIN;
			else
				error = EINVAL;
			kmem_free(cbplist, ssize);
			return (error);
		}
		lio_head_port = pnotify.portnfy_port;
		portused = 1;
	}

	/*
	 * a list head should be allocated if notification is
	 * enabled for this list.
	 */
	head = NULL;

	if (mode_arg == LIO_WAIT || sigev) {
		mutex_enter(&aiop->aio_mutex);
		error = aio_lio_alloc(&head);
		mutex_exit(&aiop->aio_mutex);
		if (error)
			goto done;
		/* deadhead stays set until a request is linked to head */
		deadhead = 1;
		head->lio_nent = nent;
		head->lio_refcnt = nent;
		head->lio_port = -1;
		head->lio_portkev = NULL;
		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
			if (sqp == NULL) {
				error = EAGAIN;
				goto done;
			}
			sqp->sq_func = NULL;
			sqp->sq_next = NULL;
			sqp->sq_info.si_code = SI_ASYNCIO;
			sqp->sq_info.si_pid = curproc->p_pid;
			sqp->sq_info.si_ctid = PRCTID(curproc);
			sqp->sq_info.si_zoneid = getzoneid();
			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
			sqp->sq_info.si_signo = sigevk.sigev_signo;
			sqp->sq_info.si_value.sival_int =
			    sigevk.sigev_value.sival_int;
			head->lio_sigqp = sqp;
		} else {
			head->lio_sigqp = NULL;
		}
		if (pkevtp) {
			/*
			 * Prepare data to send when list of aiocb's
			 * has completed.
			 */
			port_init_event(pkevtp, (uintptr_t)sigev,
			    (void *)(uintptr_t)pnotify.portnfy_user,
			    NULL, head);
			pkevtp->portkev_events = AIOLIO64;
			head->lio_portkev = pkevtp;
			head->lio_port = pnotify.portnfy_port;
		}
	}

	for (i = 0; i < nent; i++, ucbp++) {

		cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
		/*
		 * skip entry if it can't be copied.
		 * Every skipped/failed entry must drop its share of the
		 * list head's nent/refcnt accounting.
		 */
		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* skip if opcode for aiocb is LIO_NOP */
		mode = aiocb->aio_lio_opcode;
		if (mode == LIO_NOP) {
			cbp = NULL;
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* increment file descriptor's ref count. */
		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * check the permission of the partition
		 */
		if ((fp->f_flag & mode) == 0) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * common case where requests are to the same fd
		 * for the same r/w operation
		 * for UFS, need to set EBADFD
		 */
		vp = fp->f_vnode;
		if (fp != prev_fp || mode != prev_mode) {
			aio_func = check_vp(vp, mode);
			if (aio_func == NULL) {
				prev_fp = NULL;
				releasef(aiocb->aio_fildes);
				lio_set_uerror(&cbp->aio_resultp, EBADFD);
				aio_notsupported++;
				if (head) {
					mutex_enter(&aiop->aio_mutex);
					head->lio_nent--;
					head->lio_refcnt--;
					mutex_exit(&aiop->aio_mutex);
				}
				continue;
			} else {
				prev_fp = fp;
				prev_mode = mode;
			}
		}

#ifdef _LP64
		aiocb_LFton(aiocb, &aiocb_n);
		error = aio_req_setup(&reqp, aiop, &aiocb_n,
		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
#else
		error = aio_req_setupLF(&reqp, aiop, aiocb,
		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
#endif  /* _LP64 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		reqp->aio_req_lio = head;
		deadhead = 0;

		/*
		 * Set the errno field now before sending the request to
		 * the driver to avoid a race condition
		 */
		(void) suword32(&cbp->aio_resultp.aio_errno,
		    EINPROGRESS);

		reqp->aio_req_iocb.iocb32 = *ucbp;

		event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
		if (aio_port | aio_thread) {
			port_kevent_t *lpkevp;
			/*
			 * Prepare data to send with each aiocb completed.
			 */
			if (aio_port) {
				void *paddr = (void *)(uintptr_t)
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
				if (copyin(paddr, &pnotify, sizeof (pnotify)))
					error = EFAULT;
			} else {	/* aio_thread */
				pnotify.portnfy_port =
				    aiocb->aio_sigevent.sigev_signo;
				pnotify.portnfy_user =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
			}
			/*
			 * reuse the list-head event when this aiocb posts
			 * to the same port; otherwise allocate a new one.
			 */
			if (error)
				/* EMPTY */;
			else if (pkevtp != NULL &&
			    pnotify.portnfy_port == lio_head_port)
				error = port_dup_event(pkevtp, &lpkevp,
				    PORT_ALLOC_DEFAULT);
			else
				error = port_alloc_event(pnotify.portnfy_port,
				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
				    &lpkevp);
			if (error == 0) {
				port_init_event(lpkevp, (uintptr_t)*ucbp,
				    (void *)(uintptr_t)pnotify.portnfy_user,
				    aio_port_callback, reqp);
				lpkevp->portkev_events = event;
				reqp->aio_req_portkev = lpkevp;
				reqp->aio_req_port = pnotify.portnfy_port;
			}
		}

		/*
		 * send the request to driver.
		 */
		if (error == 0) {
			if (aiocb->aio_nbytes == 0) {
				clear_active_fd(aiocb->aio_fildes);
				aio_zerolen(reqp);
				continue;
			}
			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
			    CRED());
		}

		/*
		 * the fd's ref count is not decremented until the IO has
		 * completed unless there was an error.
		 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			if (error == ENOTSUP)
				aio_notsupported++;
			else
				aio_errors++;
			lio_set_error(reqp, portused);
		} else {
			clear_active_fd(aiocb->aio_fildes);
		}
	}

	if (aio_notsupported) {
		error = ENOTSUP;
	} else if (aio_errors) {
		/*
		 * return EIO if any request failed
		 */
		error = EIO;
	}

	if (mode_arg == LIO_WAIT) {
		/* block until every request in the list has completed */
		mutex_enter(&aiop->aio_mutex);
		while (head->lio_refcnt > 0) {
			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
				mutex_exit(&aiop->aio_mutex);
				error = EINTR;
				goto done;
			}
		}
		mutex_exit(&aiop->aio_mutex);
		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
	}

done:
	kmem_free(cbplist, ssize);
	if (deadhead) {
		/* no request was attached to head; free its resources */
		if (head->lio_sigqp)
			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
		if (head->lio_portkev)
			port_free_event(head->lio_portkev);
		kmem_free(head, sizeof (aio_lio_t));
	}
	return (error);
}

#ifdef _SYSCALL32_IMPL
/*
 * Expand a 32-bit largefile aiocb into the native 64-bit aiocb.
 */
static void
aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
{
	dest->aio_fildes = src->aio_fildes;
	dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
	dest->aio_nbytes = (size_t)src->aio_nbytes;
	dest->aio_offset = (off_t)src->aio_offset;
	dest->aio_reqprio = src->aio_reqprio;
	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;

	/*
	 * See comment in sigqueue32() on handling of 32-bit
	 * sigvals in a 64-bit kernel.
	 */
	dest->aio_sigevent.sigev_value.sival_int =
	    (int)src->aio_sigevent.sigev_value.sival_int;
	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
	dest->aio_lio_opcode = src->aio_lio_opcode;
	dest->aio_state = src->aio_state;
	dest->aio__pad[0] = src->aio__pad[0];
}
#endif

/*
 * This function is used only for largefile calls made by
 * 32 bit applications.
 */
static int
aio_req_setupLF(
	aio_req_t	**reqpp,
	aio_t		*aiop,
	aiocb64_32_t	*arg,
	aio_result_t	*resultp,
	vnode_t		*vp,
	int		old_solaris_req)
{
	sigqueue_t	*sqp = NULL;
	aio_req_t	*reqp;
	struct uio	*uio;
	struct sigevent32 *sigev;
	int		error;

	/* pre-allocate the completion signal, if one was requested */
	sigev = &arg->aio_sigevent;
	if (sigev->sigev_notify == SIGEV_SIGNAL &&
	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
		if (sqp == NULL)
			return (EAGAIN);
		sqp->sq_func = NULL;
		sqp->sq_next = NULL;
		sqp->sq_info.si_code = SI_ASYNCIO;
		sqp->sq_info.si_pid = curproc->p_pid;
		sqp->sq_info.si_ctid = PRCTID(curproc);
		sqp->sq_info.si_zoneid = getzoneid();
		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
		sqp->sq_info.si_signo = sigev->sigev_signo;
		sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
	}

	mutex_enter(&aiop->aio_mutex);

	/* new requests are refused while the process is tearing down aio */
	if (aiop->aio_flags & AIO_REQ_BLOCK) {
		mutex_exit(&aiop->aio_mutex);
		if (sqp)
			kmem_free(sqp, sizeof (sigqueue_t));
		return (EIO);
	}
	/*
	 * get an aio_reqp from the free list or allocate one
	 * from dynamic memory.
	 */
	if (error = aio_req_alloc(&reqp, resultp)) {
		mutex_exit(&aiop->aio_mutex);
		if (sqp)
			kmem_free(sqp, sizeof (sigqueue_t));
		return (error);
	}
	aiop->aio_pending++;
	aiop->aio_outstanding++;
	reqp->aio_req_flags = AIO_PENDING;
	if (old_solaris_req) {
		/* this is an old solaris aio request */
		reqp->aio_req_flags |= AIO_SOLARIS;
		aiop->aio_flags |= AIO_SOLARIS_REQ;
	}
	if (sigev->sigev_notify == SIGEV_THREAD ||
	    sigev->sigev_notify == SIGEV_PORT)
		aio_enq(&aiop->aio_portpending, reqp, 0);
	mutex_exit(&aiop->aio_mutex);
	/*
	 * initialize aio request.
	 */
	reqp->aio_req_fd = arg->aio_fildes;
	reqp->aio_req_sigqp = sqp;
	reqp->aio_req_iocb.iocb = NULL;
	reqp->aio_req_lio = NULL;
	reqp->aio_req_buf.b_file = vp;
	uio = reqp->aio_req.aio_uio;
	uio->uio_iovcnt = 1;
	uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
	uio->uio_iov->iov_len = arg->aio_nbytes;
	uio->uio_loffset = arg->aio_offset;
	*reqpp = reqp;
	return (0);
}

/*
 * This routine is called when a non largefile call is made by a 32bit
 * process on a ILP32 or LP64 kernel.
 */
static int
alio32(
	int		mode_arg,
	void		*aiocb_arg,
	int		nent,
	void		*sigev)
{
	file_t		*fp;
	file_t		*prev_fp = NULL;
	int		prev_mode = -1;
	struct vnode	*vp;
	aio_lio_t	*head;
	aio_req_t	*reqp;
	aio_t		*aiop;
	caddr_t		cbplist;
	aiocb_t		cb;
	aiocb_t		*aiocb = &cb;
#ifdef _LP64
	aiocb32_t	*cbp;
	caddr32_t	*ucbp;
	aiocb32_t	cb32;
	aiocb32_t	*aiocb32 = &cb32;
	struct sigevent32	sigevk;
#else
	aiocb_t		*cbp, **ucbp;
	struct sigevent	sigevk;
#endif
	sigqueue_t	*sqp;
	int		(*aio_func)();
	int		mode;
	int		error = 0;
	int		aio_errors = 0;
	int		i;
	size_t		ssize;
	int		deadhead = 0;	/* head allocated but no req attached */
	int		aio_notsupported = 0;
	int		lio_head_port;
	int		aio_port;
	int		aio_thread;
	port_kevent_t	*pkevtp = NULL;
	int		portused = 0;
#ifdef _LP64
	port_notify32_t	pnotify;
#else
	port_notify_t	pnotify;
#endif
	int		event;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	/* copy in the user's array of aiocb pointers */
#ifdef _LP64
	ssize = (sizeof (caddr32_t) * nent);
#else
	ssize = (sizeof (aiocb_t *) * nent);
#endif
	cbplist = kmem_alloc(ssize, KM_SLEEP);
	ucbp = (void *)cbplist;

	if (copyin(aiocb_arg, cbplist, ssize) ||
	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) {
		kmem_free(cbplist, ssize);
		return (EFAULT);
	}

	/* Event Ports */
	if (sigev &&
	    (sigevk.sigev_notify == SIGEV_THREAD ||
	    sigevk.sigev_notify == SIGEV_PORT)) {
		if (sigevk.sigev_notify == SIGEV_THREAD) {
			pnotify.portnfy_port = sigevk.sigev_signo;
			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
		} else if (copyin(
		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
		    &pnotify, sizeof (pnotify))) {
			kmem_free(cbplist, ssize);
			return (EFAULT);
		}
		error = port_alloc_event(pnotify.portnfy_port,
		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
		if (error) {
			if (error == ENOMEM || error == EAGAIN)
				error = EAGAIN;
			else
				error = EINVAL;
			kmem_free(cbplist, ssize);
			return (error);
		}
		lio_head_port = pnotify.portnfy_port;
		portused = 1;
	}

	/*
	 * a list head should be allocated if notification is
	 * enabled for this list.
	 */
	head = NULL;

	if (mode_arg == LIO_WAIT || sigev) {
		mutex_enter(&aiop->aio_mutex);
		error = aio_lio_alloc(&head);
		mutex_exit(&aiop->aio_mutex);
		if (error)
			goto done;
		/* deadhead stays set until a request is linked to head */
		deadhead = 1;
		head->lio_nent = nent;
		head->lio_refcnt = nent;
		head->lio_port = -1;
		head->lio_portkev = NULL;
		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
			if (sqp == NULL) {
				error = EAGAIN;
				goto done;
			}
			sqp->sq_func = NULL;
			sqp->sq_next = NULL;
			sqp->sq_info.si_code = SI_ASYNCIO;
			sqp->sq_info.si_pid = curproc->p_pid;
			sqp->sq_info.si_ctid = PRCTID(curproc);
			sqp->sq_info.si_zoneid = getzoneid();
			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
			sqp->sq_info.si_signo = sigevk.sigev_signo;
			sqp->sq_info.si_value.sival_int =
			    sigevk.sigev_value.sival_int;
			head->lio_sigqp = sqp;
		} else {
			head->lio_sigqp = NULL;
		}
		if (pkevtp) {
			/*
			 * Prepare data to send when list of aiocb's has
			 * completed.
			 */
			port_init_event(pkevtp, (uintptr_t)sigev,
			    (void *)(uintptr_t)pnotify.portnfy_user,
			    NULL, head);
			pkevtp->portkev_events = AIOLIO;
			head->lio_portkev = pkevtp;
			head->lio_port = pnotify.portnfy_port;
		}
	}

	for (i = 0; i < nent; i++, ucbp++) {

		/*
		 * skip entry if it can't be copied.
		 * Every skipped/failed entry must drop its share of the
		 * list head's nent/refcnt accounting.
		 */
#ifdef _LP64
		cbp = (aiocb32_t *)(uintptr_t)*ucbp;
		if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32)))
#else
		cbp = (aiocb_t *)*ucbp;
		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb)))
#endif
		{
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}
#ifdef _LP64
		/*
		 * copy 32 bit structure into 64 bit structure
		 */
		aiocb_32ton(aiocb32, aiocb);
#endif /* _LP64 */

		/* skip if opcode for aiocb is LIO_NOP */
		mode = aiocb->aio_lio_opcode;
		if (mode == LIO_NOP) {
			cbp = NULL;
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* increment file descriptor's ref count. */
		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * check the permission of the partition
		 */
		if ((fp->f_flag & mode) == 0) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * common case where requests are to the same fd
		 * for the same r/w operation
		 * for UFS, need to set EBADFD
		 */
		vp = fp->f_vnode;
		if (fp != prev_fp || mode != prev_mode) {
			aio_func = check_vp(vp, mode);
			if (aio_func == NULL) {
				prev_fp = NULL;
				releasef(aiocb->aio_fildes);
				lio_set_uerror(&cbp->aio_resultp, EBADFD);
				aio_notsupported++;
				if (head) {
					mutex_enter(&aiop->aio_mutex);
					head->lio_nent--;
					head->lio_refcnt--;
					mutex_exit(&aiop->aio_mutex);
				}
				continue;
			} else {
				prev_fp = fp;
				prev_mode = mode;
			}
		}

		error = aio_req_setup(&reqp, aiop, aiocb,
		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		reqp->aio_req_lio = head;
		deadhead = 0;

		/*
		 * Set the errno field now before sending the request to
		 * the driver to avoid a race condition
		 */
		(void) suword32(&cbp->aio_resultp.aio_errno,
		    EINPROGRESS);

		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp;

		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
		if (aio_port | aio_thread) {
			port_kevent_t *lpkevp;
			/*
			 * Prepare data to send with each aiocb completed.
			 */
#ifdef _LP64
			if (aio_port) {
				void *paddr = (void *)(uintptr_t)
				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
				if (copyin(paddr, &pnotify, sizeof (pnotify)))
					error = EFAULT;
			} else {	/* aio_thread */
				pnotify.portnfy_port =
				    aiocb32->aio_sigevent.sigev_signo;
				pnotify.portnfy_user =
				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
			}
#else
			if (aio_port) {
				void *paddr =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
				if (copyin(paddr, &pnotify, sizeof (pnotify)))
					error = EFAULT;
			} else {	/* aio_thread */
				pnotify.portnfy_port =
				    aiocb->aio_sigevent.sigev_signo;
				pnotify.portnfy_user =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
			}
#endif
			/*
			 * reuse the list-head event when this aiocb posts
			 * to the same port; otherwise allocate a new one.
			 */
			if (error)
				/* EMPTY */;
			else if (pkevtp != NULL &&
			    pnotify.portnfy_port == lio_head_port)
				error = port_dup_event(pkevtp, &lpkevp,
				    PORT_ALLOC_DEFAULT);
			else
				error = port_alloc_event(pnotify.portnfy_port,
				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
				    &lpkevp);
			if (error == 0) {
				port_init_event(lpkevp, (uintptr_t)cbp,
				    (void *)(uintptr_t)pnotify.portnfy_user,
				    aio_port_callback, reqp);
				lpkevp->portkev_events = event;
				reqp->aio_req_portkev = lpkevp;
				reqp->aio_req_port = pnotify.portnfy_port;
			}
		}

		/*
		 * send the request to driver.
		 */
		if (error == 0) {
			if (aiocb->aio_nbytes == 0) {
				clear_active_fd(aiocb->aio_fildes);
				aio_zerolen(reqp);
				continue;
			}
			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
			    CRED());
		}

		/*
		 * the fd's ref count is not decremented until the IO has
		 * completed unless there was an error.
		 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			if (error == ENOTSUP)
				aio_notsupported++;
			else
				aio_errors++;
			lio_set_error(reqp, portused);
		} else {
			clear_active_fd(aiocb->aio_fildes);
		}
	}

	if (aio_notsupported) {
		error = ENOTSUP;
	} else if (aio_errors) {
		/*
		 * return EIO if any request failed
		 */
		error = EIO;
	}

	if (mode_arg == LIO_WAIT) {
		/* block until every request in the list has completed */
		mutex_enter(&aiop->aio_mutex);
		while (head->lio_refcnt > 0) {
			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
				mutex_exit(&aiop->aio_mutex);
				error = EINTR;
				goto done;
			}
		}
		mutex_exit(&aiop->aio_mutex);
		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
	}

done:
	kmem_free(cbplist, ssize);
	if (deadhead) {
		/* no request was attached to head; free its resources */
		if (head->lio_sigqp)
			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
		if (head->lio_portkev)
			port_free_event(head->lio_portkev);
		kmem_free(head, sizeof (aio_lio_t));
	}
	return (error);
}


#ifdef _SYSCALL32_IMPL
/*
 * Expand a 32-bit aiocb into the native 64-bit aiocb.
 */
void
aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
{
	dest->aio_fildes = src->aio_fildes;
	dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
	dest->aio_nbytes = (size_t)src->aio_nbytes;
	dest->aio_offset = (off_t)src->aio_offset;
	dest->aio_reqprio = src->aio_reqprio;
	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;

	/*
	 * See comment in sigqueue32() on handling of 32-bit
	 * sigvals in a 64-bit kernel.
	 */
	dest->aio_sigevent.sigev_value.sival_int =
	    (int)src->aio_sigevent.sigev_value.sival_int;
	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
	dest->aio_lio_opcode = src->aio_lio_opcode;
	dest->aio_state = src->aio_state;
	dest->aio__pad[0] = src->aio__pad[0];
}
#endif /* _SYSCALL32_IMPL */

/*
 * aio_port_callback() is called just before the event is retrieved from the
 * port. The task of this callback function is to finish the work of the
 * transaction for the application, which means:
 * - copy out the transaction data to the application
 *   (this thread is running in the right process context)
 * - keep track of the transaction (update counters)
 * - free allocated buffers
 * The aiocb pointer is the object element of the port_kevent_t structure.
 *
 * flag :
 *	PORT_CALLBACK_DEFAULT : do copyout and free resources
 *	PORT_CALLBACK_CLOSE : don't do copyout, free resources
 *
 * Returns 0 on success, EACCES when invoked from a foreign process context.
 */

/*ARGSUSED*/
int
aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
{
	aio_t *aiop = curproc->p_aio;
	aio_req_t *reqp = arg;
	struct iovec *iov;
	struct buf *bp;
	void *resultp;

	/* the copyout below must run in the owning process's context */
	if (pid != curproc->p_pid) {
		/* wrong process; cannot deliver the data here */
		return (EACCES);
	}

	/* detach the request from the port queue before touching it further */
	mutex_enter(&aiop->aio_portq_mutex);
	reqp->aio_req_portkev = NULL;
	aio_req_remove_portq(aiop, reqp); /* remove request from portq */
	mutex_exit(&aiop->aio_portq_mutex);
	aphysio_unlock(reqp);		/* unlock used pages */
	mutex_enter(&aiop->aio_mutex);
	if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
		/* result was already copied out; just recycle the request */
		aio_req_free_port(aiop, reqp);	/* back to free list */
		mutex_exit(&aiop->aio_mutex);
		return (0);
	}

	iov = reqp->aio_req_uio.uio_iov;
	bp = &reqp->aio_req_buf;
	resultp = (void *)reqp->aio_req_resultp;
	/* PORT_CALLBACK_CLOSE skips the copyout but still frees the request */
	if (flag == PORT_CALLBACK_DEFAULT)
		aio_copyout_result_port(iov, bp, resultp);
	aio_req_free_port(aiop, reqp);	/* request struct back to free list */
	mutex_exit(&aiop->aio_mutex);
	return (0);
}