/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * Kernel asynchronous I/O.
 * This is only for raw devices now (as of Nov. 1993).
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/fs/snode.h>
#include <sys/unistd.h>
#include <sys/cmn_err.h>
#include <vm/as.h>
#include <vm/faultcode.h>
#include <sys/sysmacros.h>
#include <sys/procfs.h>
#include <sys/kmem.h>
#include <sys/autoconf.h>
#include <sys/ddi_impldefs.h>
#include <sys/sunddi.h>
#include <sys/aio_impl.h>
#include <sys/debug.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/fs/pxfs_ki.h>
#include <sys/contract/process_impl.h>

/*
 * external entry point.
 */
#ifdef _LP64
static int64_t kaioc(long, long, long, long, long, long);
#endif
static int kaio(ulong_t *, rval_t *);


#define	AIO_64		0
#define	AIO_32		1
#define	AIO_LARGEFILE	2

/*
 * implementation specific functions (private)
 */
#ifdef _LP64
static int alio(int, int, aiocb_t **, int, struct sigevent *);
#endif
static int aionotify(void);
static int aioinit(void);
static int aiostart(void);
static void alio_cleanup(aio_t *, aiocb_t **, int, int);
static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
	cred_t *);
static void lio_set_error(aio_req_t *);
static aio_t *aio_aiop_alloc();
static int aio_req_alloc(aio_req_t **, aio_result_t *);
static int aio_lio_alloc(aio_lio_t **);
static aio_req_t *aio_req_done(void *);
static aio_req_t *aio_req_remove(aio_req_t *);
static int aio_req_find(aio_result_t *, aio_req_t **);
static int aio_hash_insert(struct aio_req_t *, aio_t *);
static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
	aio_result_t *, int, vnode_t *);
static int aio_cleanup_thread(aio_t *);
static aio_lio_t *aio_list_get(aio_result_t *);
static void lio_set_uerror(void *, int);
extern void aio_zerolen(aio_req_t *);
static int aiowait(struct timeval *, int, long *);
static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
	aio_req_t *reqlist, aio_t *aiop, model_t model);
static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
static int aiosuspend(void *, int, struct timespec *, int,
	long *, int);
static int aliowait(int, void *, int, void *, int);
static int aioerror(void *, int);
static int aio_cancel(int, void *, long *, int);
static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
static int aiorw(int, void *, int, int);

static int alioLF(int, void *, int, void *);
static int aio_req_setupLF(aio_req_t **, aio_t *,
	aiocb64_32_t *, aio_result_t *, int, vnode_t *);
static int alio32(int, void *, int, void *);
static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);

#ifdef _SYSCALL32_IMPL
static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
void aiocb_32ton(aiocb32_t *, aiocb_t *);
#endif /* _SYSCALL32_IMPL */

/*
 * implementation specific functions (external)
 */
void aio_req_free(aio_t *, aio_req_t *);

/*
 * Event Port framework
 */

void aio_req_free_port(aio_t *, aio_req_t *);
static int aio_port_callback(void *, int *, pid_t, int, void *);

/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>
#include <sys/syscall.h>

#ifdef _LP64

static struct sysent kaio_sysent = {
	6,
	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
	(int (*)())kaioc
};

#ifdef _SYSCALL32_IMPL
static struct sysent kaio_sysent32 = {
	7,
	SE_NOUNLOAD | SE_64RVAL,
	kaio
};
#endif /* _SYSCALL32_IMPL */

#else /* _LP64 */

static struct sysent kaio_sysent = {
	7,
	SE_NOUNLOAD | SE_32RVAL1,
	kaio
};

#endif /* _LP64 */

/*
 * Module linkage information for the kernel.
167 */ 168 169 static struct modlsys modlsys = { 170 &mod_syscallops, 171 "kernel Async I/O", 172 &kaio_sysent 173 }; 174 175 #ifdef _SYSCALL32_IMPL 176 static struct modlsys modlsys32 = { 177 &mod_syscallops32, 178 "kernel Async I/O for 32 bit compatibility", 179 &kaio_sysent32 180 }; 181 #endif /* _SYSCALL32_IMPL */ 182 183 184 static struct modlinkage modlinkage = { 185 MODREV_1, 186 &modlsys, 187 #ifdef _SYSCALL32_IMPL 188 &modlsys32, 189 #endif 190 NULL 191 }; 192 193 int 194 _init(void) 195 { 196 int retval; 197 198 if ((retval = mod_install(&modlinkage)) != 0) 199 return (retval); 200 201 return (0); 202 } 203 204 int 205 _fini(void) 206 { 207 int retval; 208 209 retval = mod_remove(&modlinkage); 210 211 return (retval); 212 } 213 214 int 215 _info(struct modinfo *modinfop) 216 { 217 return (mod_info(&modlinkage, modinfop)); 218 } 219 220 #ifdef _LP64 221 static int64_t 222 kaioc( 223 long a0, 224 long a1, 225 long a2, 226 long a3, 227 long a4, 228 long a5) 229 { 230 int error; 231 long rval = 0; 232 233 switch ((int)a0 & ~AIO_POLL_BIT) { 234 case AIOREAD: 235 error = arw((int)a0, (int)a1, (char *)a2, (int)a3, 236 (offset_t)a4, (aio_result_t *)a5, FREAD); 237 break; 238 case AIOWRITE: 239 error = arw((int)a0, (int)a1, (char *)a2, (int)a3, 240 (offset_t)a4, (aio_result_t *)a5, FWRITE); 241 break; 242 case AIOWAIT: 243 error = aiowait((struct timeval *)a1, (int)a2, &rval); 244 break; 245 case AIOWAITN: 246 error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3, 247 (timespec_t *)a4); 248 break; 249 case AIONOTIFY: 250 error = aionotify(); 251 break; 252 case AIOINIT: 253 error = aioinit(); 254 break; 255 case AIOSTART: 256 error = aiostart(); 257 break; 258 case AIOLIO: 259 error = alio((int)a0, (int)a1, (aiocb_t **)a2, (int)a3, 260 (struct sigevent *)a4); 261 break; 262 case AIOLIOWAIT: 263 error = aliowait((int)a1, (void *)a2, (int)a3, 264 (struct sigevent *)a4, AIO_64); 265 break; 266 case AIOSUSPEND: 267 error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3, 268 (int)a4, &rval, AIO_64); 269 break; 270 case AIOERROR: 271 error = aioerror((void *)a1, AIO_64); 272 break; 273 case AIOAREAD: 274 error = aiorw((int)a0, (void *)a1, FREAD, AIO_64); 275 break; 276 case AIOAWRITE: 277 error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64); 278 break; 279 case AIOCANCEL: 280 error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64); 281 break; 282 283 /* 284 * The large file related stuff is valid only for 285 * 32 bit kernel and not for 64 bit kernel 286 * On 64 bit kernel we convert large file calls 287 * to regular 64bit calls. 
288 */ 289 290 default: 291 error = EINVAL; 292 } 293 if (error) 294 return ((int64_t)set_errno(error)); 295 return (rval); 296 } 297 #endif 298 299 static int 300 kaio( 301 ulong_t *uap, 302 rval_t *rvp) 303 { 304 long rval = 0; 305 int error = 0; 306 offset_t off; 307 308 309 rvp->r_vals = 0; 310 #if defined(_LITTLE_ENDIAN) 311 off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4]; 312 #else 313 off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5]; 314 #endif 315 316 switch (uap[0] & ~AIO_POLL_BIT) { 317 /* 318 * It must be the 32 bit system call on 64 bit kernel 319 */ 320 case AIOREAD: 321 return (arw((int)uap[0], (int)uap[1], (char *)uap[2], 322 (int)uap[3], off, (aio_result_t *)uap[6], FREAD)); 323 case AIOWRITE: 324 return (arw((int)uap[0], (int)uap[1], (char *)uap[2], 325 (int)uap[3], off, (aio_result_t *)uap[6], FWRITE)); 326 case AIOWAIT: 327 error = aiowait((struct timeval *)uap[1], (int)uap[2], 328 &rval); 329 break; 330 case AIOWAITN: 331 error = aiowaitn((void *)uap[1], (uint_t)uap[2], 332 (uint_t *)uap[3], (timespec_t *)uap[4]); 333 break; 334 case AIONOTIFY: 335 return (aionotify()); 336 case AIOINIT: 337 return (aioinit()); 338 case AIOSTART: 339 return (aiostart()); 340 case AIOLIO: 341 return (alio32((int)uap[1], (void *)uap[2], (int)uap[3], 342 (void *)uap[4])); 343 case AIOLIOWAIT: 344 return (aliowait((int)uap[1], (void *)uap[2], 345 (int)uap[3], (struct sigevent *)uap[4], AIO_32)); 346 case AIOSUSPEND: 347 error = aiosuspend((void *)uap[1], (int)uap[2], 348 (timespec_t *)uap[3], (int)uap[4], 349 &rval, AIO_32); 350 break; 351 case AIOERROR: 352 return (aioerror((void *)uap[1], AIO_32)); 353 case AIOAREAD: 354 return (aiorw((int)uap[0], (void *)uap[1], 355 FREAD, AIO_32)); 356 case AIOAWRITE: 357 return (aiorw((int)uap[0], (void *)uap[1], 358 FWRITE, AIO_32)); 359 case AIOCANCEL: 360 error = (aio_cancel((int)uap[1], (void *)uap[2], &rval, 361 AIO_32)); 362 break; 363 case AIOLIO64: 364 return (alioLF((int)uap[1], (void *)uap[2], 365 (int)uap[3], (void *)uap[4])); 366 case AIOLIOWAIT64: 367 return (aliowait(uap[1], (void *)uap[2], 368 (int)uap[3], (void *)uap[4], AIO_LARGEFILE)); 369 case AIOSUSPEND64: 370 error = aiosuspend((void *)uap[1], (int)uap[2], 371 (timespec_t *)uap[3], (int)uap[4], &rval, 372 AIO_LARGEFILE); 373 break; 374 case AIOERROR64: 375 return (aioerror((void *)uap[1], AIO_LARGEFILE)); 376 case AIOAREAD64: 377 return (aiorw((int)uap[0], (void *)uap[1], FREAD, 378 AIO_LARGEFILE)); 379 case AIOAWRITE64: 380 return (aiorw((int)uap[0], (void *)uap[1], FWRITE, 381 AIO_LARGEFILE)); 382 case AIOCANCEL64: 383 error = (aio_cancel((int)uap[1], (void *)uap[2], 384 &rval, AIO_LARGEFILE)); 385 break; 386 default: 387 return (EINVAL); 388 } 389 390 rvp->r_val1 = rval; 391 return (error); 392 } 393 394 /* 395 * wake up LWPs in this process that are sleeping in 396 * aiowait(). 
397 */ 398 static int 399 aionotify(void) 400 { 401 aio_t *aiop; 402 403 aiop = curproc->p_aio; 404 if (aiop == NULL) 405 return (0); 406 407 mutex_enter(&aiop->aio_mutex); 408 aiop->aio_notifycnt++; 409 cv_broadcast(&aiop->aio_waitcv); 410 mutex_exit(&aiop->aio_mutex); 411 412 return (0); 413 } 414 415 static int 416 timeval2reltime(struct timeval *timout, timestruc_t *rqtime, 417 timestruc_t **rqtp, int *blocking) 418 { 419 #ifdef _SYSCALL32_IMPL 420 struct timeval32 wait_time_32; 421 #endif 422 struct timeval wait_time; 423 model_t model = get_udatamodel(); 424 425 *rqtp = NULL; 426 if (timout == NULL) { /* wait indefinitely */ 427 *blocking = 1; 428 return (0); 429 } 430 431 /* 432 * Need to correctly compare with the -1 passed in for a user 433 * address pointer, with both 32 bit and 64 bit apps. 434 */ 435 if (model == DATAMODEL_NATIVE) { 436 if ((intptr_t)timout == (intptr_t)-1) { /* don't wait */ 437 *blocking = 0; 438 return (0); 439 } 440 441 if (copyin(timout, &wait_time, sizeof (wait_time))) 442 return (EFAULT); 443 } 444 #ifdef _SYSCALL32_IMPL 445 else { 446 /* 447 * -1 from a 32bit app. It will not get sign extended. 448 * don't wait if -1. 449 */ 450 if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) { 451 *blocking = 0; 452 return (0); 453 } 454 455 if (copyin(timout, &wait_time_32, sizeof (wait_time_32))) 456 return (EFAULT); 457 TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32); 458 } 459 #endif /* _SYSCALL32_IMPL */ 460 461 if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) { /* don't wait */ 462 *blocking = 0; 463 return (0); 464 } 465 466 if (wait_time.tv_sec < 0 || 467 wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC) 468 return (EINVAL); 469 470 rqtime->tv_sec = wait_time.tv_sec; 471 rqtime->tv_nsec = wait_time.tv_usec * 1000; 472 *rqtp = rqtime; 473 *blocking = 1; 474 475 return (0); 476 } 477 478 static int 479 timespec2reltime(timespec_t *timout, timestruc_t *rqtime, 480 timestruc_t **rqtp, int *blocking) 481 { 482 #ifdef _SYSCALL32_IMPL 483 timespec32_t wait_time_32; 484 #endif 485 model_t model = get_udatamodel(); 486 487 *rqtp = NULL; 488 if (timout == NULL) { 489 *blocking = 1; 490 return (0); 491 } 492 493 if (model == DATAMODEL_NATIVE) { 494 if (copyin(timout, rqtime, sizeof (*rqtime))) 495 return (EFAULT); 496 } 497 #ifdef _SYSCALL32_IMPL 498 else { 499 if (copyin(timout, &wait_time_32, sizeof (wait_time_32))) 500 return (EFAULT); 501 TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32); 502 } 503 #endif /* _SYSCALL32_IMPL */ 504 505 if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) { 506 *blocking = 0; 507 return (0); 508 } 509 510 if (rqtime->tv_sec < 0 || 511 rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC) 512 return (EINVAL); 513 514 *rqtp = rqtime; 515 *blocking = 1; 516 517 return (0); 518 } 519 520 /*ARGSUSED*/ 521 static int 522 aiowait( 523 struct timeval *timout, 524 int dontblockflg, 525 long *rval) 526 { 527 int error; 528 aio_t *aiop; 529 aio_req_t *reqp; 530 clock_t status; 531 int blocking; 532 int timecheck; 533 timestruc_t rqtime; 534 timestruc_t *rqtp; 535 536 aiop = curproc->p_aio; 537 if (aiop == NULL) 538 return (EINVAL); 539 540 /* 541 * Establish the absolute future time for the timeout. 
	 */
	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
	if (error)
		return (error);
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	mutex_enter(&aiop->aio_mutex);
	for (;;) {
		/* process requests on poll queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_mutex);
		}
		if ((reqp = aio_req_remove(NULL)) != NULL) {
			*rval = (long)reqp->aio_req_resultp;
			break;
		}
		/* user-level done queue might not be empty */
		if (aiop->aio_notifycnt > 0) {
			aiop->aio_notifycnt--;
			*rval = 1;
			break;
		}
		/* don't block if no outstanding aio */
		if (aiop->aio_outstanding == 0 && dontblockflg) {
			error = EINVAL;
			break;
		}
		if (blocking) {
			status = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);

			if (status > 0)		/* check done queue again */
				continue;
			if (status == 0) {	/* interrupted by a signal */
				error = EINTR;
				*rval = -1;
			} else {		/* timer expired */
				error = ETIME;
			}
		}
		break;
	}
	mutex_exit(&aiop->aio_mutex);
	if (reqp) {
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}
	return (error);
}

/*
 * aiowaitn can be used to reap completed asynchronous requests submitted with
 * lio_listio, aio_read or aio_write.
 * This function only reaps asynchronous raw I/Os.
 */

/*ARGSUSED*/
static int
aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
{
	int		error = 0;
	aio_t		*aiop;
	aio_req_t	*reqlist = NULL;
	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
	size_t		iocbsz;			/* user's iocb size */
	size_t		riocbsz;		/* returned iocb size */
	int		iocb_index = 0;
	model_t		model = get_udatamodel();
	int		blocking = 1;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	if (aiop->aio_outstanding == 0)
		return (EAGAIN);

	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
		return (EFAULT);

	/* set *nwait to zero, if we must return prematurely */
	if (copyout(&cnt, nwait, sizeof (uint_t)))
		return (EFAULT);

	if (waitcnt == 0) {
		blocking = 0;
		rqtp = NULL;
		waitcnt = nent;
	} else {
		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
		if (error)
			return (error);
	}

	if (model == DATAMODEL_NATIVE)
		iocbsz = (sizeof (aiocb_t *) * nent);
#ifdef _SYSCALL32_IMPL
	else
		iocbsz = (sizeof (caddr32_t) * nent);
#endif /* _SYSCALL32_IMPL */

	/*
	 * Only one aio_waitn call is allowed at a time.
	 * The active aio_waitn will collect all requests
	 * out of the "done" list and if necessary it will wait
	 * for some/all pending requests to fulfill the nwait
	 * parameter.
	 * Second and further aio_waitn calls will sleep here
	 * until the active aio_waitn finishes and leaves the kernel.
	 * If the second call does not block (poll), then it returns
	 * immediately with the error code EAGAIN.
	 * If the second call should block, then it sleeps here, but
	 * does not touch the timeout. The timeout starts when this
	 * aio_waitn call becomes active.
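	 *
	 * For reference, a minimal sketch of the user-level call that
	 * this code services (illustrative only, not part of this file;
	 * it assumes the Solaris aio_waitn(3RT) interface, and error
	 * handling is omitted):
	 *
	 *	struct aiocb *done[8];
	 *	uint_t nwait = 1;	(reap at least one request)
	 *
	 *	if (aio_waitn(done, 8, &nwait, NULL) == 0) {
	 *		(nwait now holds the number of reaped aiocbs,
	 *		 and done[0..nwait-1] point at them)
	 *	}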
670 */ 671 672 mutex_enter(&aiop->aio_mutex); 673 674 while (aiop->aio_flags & AIO_WAITN) { 675 if (blocking == 0) { 676 mutex_exit(&aiop->aio_mutex); 677 return (EAGAIN); 678 } 679 680 /* block, no timeout */ 681 aiop->aio_flags |= AIO_WAITN_PENDING; 682 if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) { 683 mutex_exit(&aiop->aio_mutex); 684 return (EINTR); 685 } 686 } 687 688 /* 689 * Establish the absolute future time for the timeout. 690 */ 691 if (rqtp) { 692 timestruc_t now; 693 timecheck = timechanged; 694 gethrestime(&now); 695 timespecadd(rqtp, &now); 696 } 697 698 if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) { 699 kmem_free(aiop->aio_iocb, aiop->aio_iocbsz); 700 aiop->aio_iocb = NULL; 701 } 702 703 if (aiop->aio_iocb == NULL) { 704 iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP); 705 if (iocblist == NULL) { 706 mutex_exit(&aiop->aio_mutex); 707 return (ENOMEM); 708 } 709 aiop->aio_iocb = (aiocb_t **)iocblist; 710 aiop->aio_iocbsz = iocbsz; 711 } else { 712 iocblist = (char *)aiop->aio_iocb; 713 } 714 715 aiop->aio_waitncnt = waitcnt; 716 aiop->aio_flags |= AIO_WAITN; 717 718 for (;;) { 719 /* push requests on poll queue to done queue */ 720 if (aiop->aio_pollq) { 721 mutex_exit(&aiop->aio_mutex); 722 aio_cleanup(0); 723 mutex_enter(&aiop->aio_mutex); 724 } 725 726 /* check for requests on done queue */ 727 if (aiop->aio_doneq) { 728 cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt); 729 aiop->aio_waitncnt = waitcnt - cnt; 730 } 731 732 /* user-level done queue might not be empty */ 733 if (aiop->aio_notifycnt > 0) { 734 aiop->aio_notifycnt--; 735 error = 0; 736 break; 737 } 738 739 /* 740 * if we are here second time as a result of timer 741 * expiration, we reset error if there are enough 742 * aiocb's to satisfy request. 743 * We return also if all requests are already done 744 * and we picked up the whole done queue. 745 */ 746 747 if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 && 748 aiop->aio_doneq == NULL)) { 749 error = 0; 750 break; 751 } 752 753 if ((cnt < waitcnt) && blocking) { 754 int rval = cv_waituntil_sig(&aiop->aio_waitcv, 755 &aiop->aio_mutex, rqtp, timecheck); 756 if (rval > 0) 757 continue; 758 if (rval < 0) { 759 error = ETIME; 760 blocking = 0; 761 continue; 762 } 763 error = EINTR; 764 } 765 break; 766 } 767 768 mutex_exit(&aiop->aio_mutex); 769 770 if (cnt > 0) { 771 772 iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist, 773 aiop, model); 774 775 if (model == DATAMODEL_NATIVE) 776 riocbsz = (sizeof (aiocb_t *) * cnt); 777 #ifdef _SYSCALL32_IMPL 778 else 779 riocbsz = (sizeof (caddr32_t) * cnt); 780 #endif /* _SYSCALL32_IMPL */ 781 782 if (copyout(iocblist, uiocb, riocbsz) || 783 copyout(&cnt, nwait, sizeof (uint_t))) 784 error = EFAULT; 785 } 786 787 if (aiop->aio_iocbsz > AIO_IOCB_MAX) { 788 kmem_free(iocblist, aiop->aio_iocbsz); 789 aiop->aio_iocb = NULL; 790 } 791 792 /* check if there is another thread waiting for execution */ 793 mutex_enter(&aiop->aio_mutex); 794 aiop->aio_flags &= ~AIO_WAITN; 795 if (aiop->aio_flags & AIO_WAITN_PENDING) { 796 aiop->aio_flags &= ~AIO_WAITN_PENDING; 797 cv_signal(&aiop->aio_waitncv); 798 } 799 mutex_exit(&aiop->aio_mutex); 800 801 return (error); 802 } 803 804 /* 805 * aio_unlock_requests 806 * copyouts the result of the request as well as the return value. 807 * It builds the list of completed asynchronous requests, 808 * unlocks the allocated memory ranges and 809 * put the aio request structure back into the free list. 
810 */ 811 812 static int 813 aio_unlock_requests( 814 caddr_t iocblist, 815 int iocb_index, 816 aio_req_t *reqlist, 817 aio_t *aiop, 818 model_t model) 819 { 820 aio_req_t *reqp, *nreqp; 821 822 if (model == DATAMODEL_NATIVE) { 823 for (reqp = reqlist; reqp != NULL; reqp = nreqp) { 824 (((caddr_t *)iocblist)[iocb_index++]) = 825 reqp->aio_req_iocb.iocb; 826 nreqp = reqp->aio_req_next; 827 aphysio_unlock(reqp); 828 aio_copyout_result(reqp); 829 mutex_enter(&aiop->aio_mutex); 830 aio_req_free(aiop, reqp); 831 mutex_exit(&aiop->aio_mutex); 832 } 833 } 834 #ifdef _SYSCALL32_IMPL 835 else { 836 for (reqp = reqlist; reqp != NULL; reqp = nreqp) { 837 ((caddr32_t *)iocblist)[iocb_index++] = 838 reqp->aio_req_iocb.iocb32; 839 nreqp = reqp->aio_req_next; 840 aphysio_unlock(reqp); 841 aio_copyout_result(reqp); 842 mutex_enter(&aiop->aio_mutex); 843 aio_req_free(aiop, reqp); 844 mutex_exit(&aiop->aio_mutex); 845 } 846 } 847 #endif /* _SYSCALL32_IMPL */ 848 return (iocb_index); 849 } 850 851 /* 852 * aio_reqlist_concat 853 * moves "max" elements from the done queue to the reqlist queue and removes 854 * the AIO_DONEQ flag. 855 * - reqlist queue is a simple linked list 856 * - done queue is a double linked list 857 */ 858 859 static int 860 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max) 861 { 862 aio_req_t *q2, *q2work, *list; 863 int count = 0; 864 865 list = *reqlist; 866 q2 = aiop->aio_doneq; 867 q2work = q2; 868 while (max-- > 0) { 869 q2work->aio_req_flags &= ~AIO_DONEQ; 870 q2work = q2work->aio_req_next; 871 count++; 872 if (q2work == q2) 873 break; 874 } 875 876 if (q2work == q2) { 877 /* all elements revised */ 878 q2->aio_req_prev->aio_req_next = list; 879 list = q2; 880 aiop->aio_doneq = NULL; 881 } else { 882 /* 883 * max < elements in the doneq 884 * detach only the required amount of elements 885 * out of the doneq 886 */ 887 q2work->aio_req_prev->aio_req_next = list; 888 list = q2; 889 890 aiop->aio_doneq = q2work; 891 q2work->aio_req_prev = q2->aio_req_prev; 892 q2->aio_req_prev->aio_req_next = q2work; 893 } 894 *reqlist = list; 895 return (count); 896 } 897 898 /*ARGSUSED*/ 899 static int 900 aiosuspend( 901 void *aiocb, 902 int nent, 903 struct timespec *timout, 904 int flag, 905 long *rval, 906 int run_mode) 907 { 908 int error; 909 aio_t *aiop; 910 aio_req_t *reqp, *found, *next; 911 caddr_t cbplist = NULL; 912 aiocb_t *cbp, **ucbp; 913 #ifdef _SYSCALL32_IMPL 914 aiocb32_t *cbp32; 915 caddr32_t *ucbp32; 916 #endif /* _SYSCALL32_IMPL */ 917 aiocb64_32_t *cbp64; 918 int rv; 919 int i; 920 size_t ssize; 921 model_t model = get_udatamodel(); 922 int blocking; 923 int timecheck; 924 timestruc_t rqtime; 925 timestruc_t *rqtp; 926 927 aiop = curproc->p_aio; 928 if (aiop == NULL || nent <= 0) 929 return (EINVAL); 930 931 /* 932 * Establish the absolute future time for the timeout. 933 */ 934 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking); 935 if (error) 936 return (error); 937 if (rqtp) { 938 timestruc_t now; 939 timecheck = timechanged; 940 gethrestime(&now); 941 timespecadd(rqtp, &now); 942 } 943 944 /* 945 * If we are not blocking and there's no IO complete 946 * skip aiocb copyin. 
947 */ 948 if (!blocking && (aiop->aio_pollq == NULL) && 949 (aiop->aio_doneq == NULL)) { 950 return (EAGAIN); 951 } 952 953 if (model == DATAMODEL_NATIVE) 954 ssize = (sizeof (aiocb_t *) * nent); 955 #ifdef _SYSCALL32_IMPL 956 else 957 ssize = (sizeof (caddr32_t) * nent); 958 #endif /* _SYSCALL32_IMPL */ 959 960 cbplist = kmem_alloc(ssize, KM_NOSLEEP); 961 if (cbplist == NULL) 962 return (ENOMEM); 963 964 if (copyin(aiocb, cbplist, ssize)) { 965 error = EFAULT; 966 goto done; 967 } 968 969 found = NULL; 970 /* 971 * we need to get the aio_cleanupq_mutex since we call 972 * aio_req_done(). 973 */ 974 mutex_enter(&aiop->aio_cleanupq_mutex); 975 mutex_enter(&aiop->aio_mutex); 976 for (;;) { 977 /* push requests on poll queue to done queue */ 978 if (aiop->aio_pollq) { 979 mutex_exit(&aiop->aio_mutex); 980 mutex_exit(&aiop->aio_cleanupq_mutex); 981 aio_cleanup(0); 982 mutex_enter(&aiop->aio_cleanupq_mutex); 983 mutex_enter(&aiop->aio_mutex); 984 } 985 /* check for requests on done queue */ 986 if (aiop->aio_doneq) { 987 if (model == DATAMODEL_NATIVE) 988 ucbp = (aiocb_t **)cbplist; 989 #ifdef _SYSCALL32_IMPL 990 else 991 ucbp32 = (caddr32_t *)cbplist; 992 #endif /* _SYSCALL32_IMPL */ 993 for (i = 0; i < nent; i++) { 994 if (model == DATAMODEL_NATIVE) { 995 if ((cbp = *ucbp++) == NULL) 996 continue; 997 if (run_mode != AIO_LARGEFILE) 998 reqp = aio_req_done( 999 &cbp->aio_resultp); 1000 else { 1001 cbp64 = (aiocb64_32_t *)cbp; 1002 reqp = aio_req_done( 1003 &cbp64->aio_resultp); 1004 } 1005 } 1006 #ifdef _SYSCALL32_IMPL 1007 else { 1008 if (run_mode == AIO_32) { 1009 if ((cbp32 = 1010 (aiocb32_t *)(uintptr_t) 1011 *ucbp32++) == NULL) 1012 continue; 1013 reqp = aio_req_done( 1014 &cbp32->aio_resultp); 1015 } else if (run_mode == AIO_LARGEFILE) { 1016 if ((cbp64 = 1017 (aiocb64_32_t *)(uintptr_t) 1018 *ucbp32++) == NULL) 1019 continue; 1020 reqp = aio_req_done( 1021 &cbp64->aio_resultp); 1022 } 1023 1024 } 1025 #endif /* _SYSCALL32_IMPL */ 1026 if (reqp) { 1027 reqp->aio_req_next = found; 1028 found = reqp; 1029 } 1030 if (aiop->aio_doneq == NULL) 1031 break; 1032 } 1033 if (found) 1034 break; 1035 } 1036 if (aiop->aio_notifycnt > 0) { 1037 /* 1038 * nothing on the kernel's queue. the user 1039 * has notified the kernel that it has items 1040 * on a user-level queue. 1041 */ 1042 aiop->aio_notifycnt--; 1043 *rval = 1; 1044 error = 0; 1045 break; 1046 } 1047 /* don't block if nothing is outstanding */ 1048 if (aiop->aio_outstanding == 0) { 1049 error = EAGAIN; 1050 break; 1051 } 1052 if (blocking) { 1053 /* 1054 * drop the aio_cleanupq_mutex as we are 1055 * going to block. 1056 */ 1057 mutex_exit(&aiop->aio_cleanupq_mutex); 1058 rv = cv_waituntil_sig(&aiop->aio_waitcv, 1059 &aiop->aio_mutex, rqtp, timecheck); 1060 /* 1061 * we have to drop aio_mutex and 1062 * grab it in the right order. 
			 */
			mutex_exit(&aiop->aio_mutex);
			mutex_enter(&aiop->aio_cleanupq_mutex);
			mutex_enter(&aiop->aio_mutex);
			if (rv > 0)	/* check done queue again */
				continue;
			if (rv == 0)	/* interrupted by a signal */
				error = EINTR;
			else		/* timer expired */
				error = ETIME;
		} else {
			error = EAGAIN;
		}
		break;
	}
	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_cleanupq_mutex);
	for (reqp = found; reqp != NULL; reqp = next) {
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}
done:
	kmem_free(cbplist, ssize);
	return (error);
}

/*
 * initialize aio by allocating an aio_t struct for this
 * process.
 */
static int
aioinit(void)
{
	proc_t *p = curproc;
	aio_t *aiop;
	mutex_enter(&p->p_lock);
	if ((aiop = p->p_aio) == NULL) {
		aiop = aio_aiop_alloc();
		p->p_aio = aiop;
	}
	mutex_exit(&p->p_lock);
	if (aiop == NULL)
		return (ENOMEM);
	return (0);
}

/*
 * start a special thread that will clean up after aio requests
 * that are preventing a segment from being unmapped. as_unmap()
 * blocks until all physio to this segment is completed. This
 * doesn't happen until no pages in this segment are SOFTLOCKed;
 * some pages will be SOFTLOCKed while there are aio requests
 * still outstanding. This special thread makes sure that these
 * SOFTLOCKed pages will eventually be SOFTUNLOCKed.
 *
 * this function will return an error if the process has only
 * one LWP. The assumption is that the caller is a separate LWP
 * that remains blocked in the kernel for the life of this process.
 */
static int
aiostart(void)
{
	proc_t *p = curproc;
	aio_t *aiop;
	int first, error = 0;

	if (p->p_lwpcnt == 1)
		return (EDEADLK);
	mutex_enter(&p->p_lock);
	if ((aiop = p->p_aio) == NULL)
		error = EINVAL;
	else {
		first = aiop->aio_ok;
		if (aiop->aio_ok == 0)
			aiop->aio_ok = 1;
	}
	mutex_exit(&p->p_lock);
	if (error == 0 && first == 0) {
		return (aio_cleanup_thread(aiop));
		/* should return only to exit */
	}
	return (error);
}

/*
 * Associate an aiocb with a port.
 * This function is used by aiorw() to associate a transaction with a port.
 * Allocate an event port structure (port_alloc_event()) and store the
 * delivered user pointer (portnfy_user) in the portkev_user field of the
 * port_kevent_t structure.
 * The aio_req_portkev pointer in the aio_req_t structure was added to identify
 * the port association.
 */

static int
aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp, aio_req_t *reqp)
{
	port_kevent_t	*pkevp = NULL;
	int		error;

	error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
	    PORT_SOURCE_AIO, &pkevp);
	if (error) {
		if ((error == ENOMEM) || (error == EAGAIN))
			error = EAGAIN;
		else
			error = EINVAL;
	} else {
		port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
		    aio_port_callback, reqp);
		reqp->aio_req_portkev = pkevp;
		reqp->aio_req_port = pntfy->portnfy_port;
	}
	return (error);
}

/*
 * Associate an aiocb with a port.
 * This function is used by lio_listio() to associate a transaction with a port.
 * Allocate an event port structure (port_alloc_event()) and store the
 * delivered user pointer (portnfy_user) in the portkev_user field of the
 * port_kevent_t structure.
 * The aio_req_portkev pointer in the aio_req_t structure was added to identify
 * the port association.
 * The event port notification can be requested by attaching the port_notify_t
 * structure to the sigevent argument of lio_listio() or by attaching the
 * port_notify_t structure to the sigevent structure which is embedded in the
 * aiocb.
 * The attachment to the global sigevent structure is valid for all aiocbs
 * in the list.
 */

static int
aio_req_assoc_port(struct sigevent *sigev, void *user, aiocb_t *cbp,
	aio_req_t *reqp, port_kevent_t *pkevtp)
{
	port_kevent_t	*pkevp = NULL;
	port_notify_t	pntfy;
	int		error;

	if (sigev->sigev_notify == SIGEV_PORT) {
		/* the aiocb has its own port notification embedded */
		if (copyin((void *)sigev->sigev_value.sival_ptr, &pntfy,
		    sizeof (port_notify_t)))
			return (EFAULT);

		error = port_alloc_event(pntfy.portnfy_port, PORT_ALLOC_DEFAULT,
		    PORT_SOURCE_AIO, &pkevp);
		if (error) {
			if ((error == ENOMEM) || (error == EAGAIN))
				return (EAGAIN);
			else
				return (EINVAL);
		}
		/* use these values instead of the global values in port */

		port_init_event(pkevp, (uintptr_t)cbp, pntfy.portnfy_user,
		    aio_port_callback, reqp);
		reqp->aio_req_port = pntfy.portnfy_port;
	} else {
		/* use global port notification */
		error = port_dup_event(pkevtp, &pkevp, PORT_ALLOC_DEFAULT);
		if (error)
			return (EAGAIN);
		port_init_event(pkevp, (uintptr_t)cbp, user, aio_port_callback,
		    reqp);
	}
	reqp->aio_req_portkev = pkevp;
	return (0);
}

/*
 * Same comments as in aio_req_assoc_port(), see above.
 */

static int
aio_req_assoc_port32(struct sigevent32 *sigev, void *user, aiocb_t *cbp,
	aio_req_t *reqp, port_kevent_t *pkevtp)
{
	port_kevent_t	*pkevp = NULL;
	port_notify32_t	pntfy;
	int		error;

	if (sigev->sigev_notify == SIGEV_PORT) {
		if (copyin((void *)(uintptr_t)sigev->sigev_value.sival_int,
		    &pntfy, sizeof (port_notify32_t)))
			return (EFAULT);

		error = port_alloc_event(pntfy.portnfy_port,
		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevp);
		if (error) {
			if ((error == ENOMEM) || (error == EAGAIN))
				return (EAGAIN);
			else
				return (EINVAL);
		}
		/* use these values instead of the global values in port */

		port_init_event(pkevp, (uintptr_t)cbp,
		    (void *)(uintptr_t)pntfy.portnfy_user,
		    aio_port_callback, reqp);
		reqp->aio_req_port = pntfy.portnfy_port;
	} else {
		error = port_dup_event(pkevtp, &pkevp, PORT_ALLOC_DEFAULT);
		if (error)
			return (EAGAIN);
		port_init_event(pkevp, (uintptr_t)cbp, user, aio_port_callback,
		    reqp);
	}
	reqp->aio_req_portkev = pkevp;
	return (0);
}


#ifdef _LP64

/*
 * Asynchronous list IO. A chain of aiocb's is copied in
 * one at a time. If an aiocb is invalid, it is skipped.
 * For each aiocb, the appropriate driver entry point is
 * called. Optimize for the common case where the list
 * of requests is to the same file descriptor.
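 *
 * For context, a minimal user-level sketch of the kind of request list
 * this code handles (illustrative only; it assumes the standard
 * lio_listio(3RT) interface, and error handling is omitted):
 *
 *	struct aiocb cb;
 *	struct aiocb *list[1];
 *
 *	(void) memset(&cb, 0, sizeof (cb));
 *	cb.aio_fildes = fd;		(raw device descriptor)
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof (buf);
 *	cb.aio_offset = 0;
 *	cb.aio_lio_opcode = LIO_READ;
 *	list[0] = &cb;
 *	(void) lio_listio(LIO_WAIT, list, 1, NULL);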
1288 * 1289 * One possible optimization is to define a new driver entry 1290 * point that supports a list of IO requests. Whether this 1291 * improves performance depends somewhat on the driver's 1292 * locking strategy. Processing a list could adversely impact 1293 * the driver's interrupt latency. 1294 */ 1295 /*ARGSUSED*/ 1296 static int 1297 alio( 1298 int opcode, 1299 int mode_arg, 1300 aiocb_t **aiocb_arg, 1301 int nent, 1302 struct sigevent *sigev) 1303 1304 { 1305 file_t *fp; 1306 file_t *prev_fp = NULL; 1307 int prev_mode = -1; 1308 struct vnode *vp; 1309 aio_lio_t *head; 1310 aio_req_t *reqp; 1311 aio_t *aiop; 1312 caddr_t cbplist; 1313 aiocb_t *cbp, **ucbp; 1314 aiocb_t cb; 1315 aiocb_t *aiocb = &cb; 1316 struct sigevent sigevk; 1317 sigqueue_t *sqp; 1318 int (*aio_func)(); 1319 int mode; 1320 int error = 0; 1321 int aio_errors = 0; 1322 int i; 1323 size_t ssize; 1324 int deadhead = 0; 1325 int aio_notsupported = 0; 1326 int aio_use_port = 0; 1327 port_kevent_t *pkevtp = NULL; 1328 port_notify_t pnotify; 1329 1330 aiop = curproc->p_aio; 1331 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 1332 return (EINVAL); 1333 1334 ssize = (sizeof (aiocb_t *) * nent); 1335 cbplist = kmem_alloc(ssize, KM_SLEEP); 1336 ucbp = (aiocb_t **)cbplist; 1337 1338 if (copyin(aiocb_arg, cbplist, sizeof (aiocb_t *) * nent)) { 1339 kmem_free(cbplist, ssize); 1340 return (EFAULT); 1341 } 1342 1343 if (sigev) { 1344 if (copyin(sigev, &sigevk, sizeof (struct sigevent))) { 1345 kmem_free(cbplist, ssize); 1346 return (EFAULT); 1347 } 1348 } 1349 1350 /* 1351 * a list head should be allocated if notification is 1352 * enabled for this list. 1353 */ 1354 head = NULL; 1355 1356 /* Event Ports */ 1357 1358 if (sigev && sigevk.sigev_notify == SIGEV_PORT) { 1359 /* Use port for completion notification */ 1360 if (copyin(sigevk.sigev_value.sival_ptr, &pnotify, 1361 sizeof (port_notify_t))) { 1362 kmem_free(cbplist, ssize); 1363 return (EFAULT); 1364 } 1365 /* use event ports for the list of aiocbs */ 1366 aio_use_port = 1; 1367 error = port_alloc_event(pnotify.portnfy_port, 1368 PORT_ALLOC_PRIVATE, PORT_SOURCE_AIO, &pkevtp); 1369 if (error) { 1370 if ((error == ENOMEM) || (error == EAGAIN)) 1371 error = EAGAIN; 1372 else 1373 error = EINVAL; 1374 kmem_free(cbplist, ssize); 1375 return (error); 1376 } 1377 } else if ((mode_arg == LIO_WAIT) || sigev) { 1378 mutex_enter(&aiop->aio_mutex); 1379 error = aio_lio_alloc(&head); 1380 mutex_exit(&aiop->aio_mutex); 1381 if (error) 1382 goto done; 1383 deadhead = 1; 1384 head->lio_nent = nent; 1385 head->lio_refcnt = nent; 1386 if (sigev && (sigevk.sigev_notify == SIGEV_SIGNAL) && 1387 (sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG)) { 1388 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 1389 if (sqp == NULL) { 1390 error = EAGAIN; 1391 goto done; 1392 } 1393 sqp->sq_func = NULL; 1394 sqp->sq_next = NULL; 1395 sqp->sq_info.si_code = SI_ASYNCIO; 1396 sqp->sq_info.si_pid = curproc->p_pid; 1397 sqp->sq_info.si_ctid = PRCTID(curproc); 1398 sqp->sq_info.si_zoneid = getzoneid(); 1399 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 1400 sqp->sq_info.si_signo = sigevk.sigev_signo; 1401 sqp->sq_info.si_value = sigevk.sigev_value; 1402 head->lio_sigqp = sqp; 1403 } else { 1404 head->lio_sigqp = NULL; 1405 } 1406 } 1407 1408 for (i = 0; i < nent; i++, ucbp++) { 1409 1410 cbp = *ucbp; 1411 /* skip entry if it can't be copied. 
*/ 1412 if (cbp == NULL || copyin(cbp, aiocb, sizeof (aiocb_t))) { 1413 if (head) { 1414 mutex_enter(&aiop->aio_mutex); 1415 head->lio_nent--; 1416 head->lio_refcnt--; 1417 mutex_exit(&aiop->aio_mutex); 1418 } 1419 continue; 1420 } 1421 1422 /* skip if opcode for aiocb is LIO_NOP */ 1423 1424 mode = aiocb->aio_lio_opcode; 1425 if (mode == LIO_NOP) { 1426 cbp = NULL; 1427 if (head) { 1428 mutex_enter(&aiop->aio_mutex); 1429 head->lio_nent--; 1430 head->lio_refcnt--; 1431 mutex_exit(&aiop->aio_mutex); 1432 } 1433 continue; 1434 } 1435 1436 /* increment file descriptor's ref count. */ 1437 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 1438 lio_set_uerror(&cbp->aio_resultp, EBADF); 1439 if (head) { 1440 mutex_enter(&aiop->aio_mutex); 1441 head->lio_nent--; 1442 head->lio_refcnt--; 1443 mutex_exit(&aiop->aio_mutex); 1444 } 1445 aio_errors++; 1446 continue; 1447 } 1448 1449 vp = fp->f_vnode; 1450 1451 /* 1452 * check the permission of the partition 1453 */ 1454 mode = aiocb->aio_lio_opcode; 1455 if ((fp->f_flag & mode) == 0) { 1456 releasef(aiocb->aio_fildes); 1457 lio_set_uerror(&cbp->aio_resultp, EBADF); 1458 if (head) { 1459 mutex_enter(&aiop->aio_mutex); 1460 head->lio_nent--; 1461 head->lio_refcnt--; 1462 mutex_exit(&aiop->aio_mutex); 1463 } 1464 aio_errors++; 1465 continue; 1466 } 1467 1468 /* 1469 * common case where requests are to the same fd for the 1470 * same r/w operation. 1471 * for UFS, need to set EBADFD 1472 */ 1473 if ((fp != prev_fp) || (mode != prev_mode)) { 1474 aio_func = check_vp(vp, mode); 1475 if (aio_func == NULL) { 1476 prev_fp = NULL; 1477 releasef(aiocb->aio_fildes); 1478 lio_set_uerror(&cbp->aio_resultp, EBADFD); 1479 aio_notsupported++; 1480 if (head) { 1481 mutex_enter(&aiop->aio_mutex); 1482 head->lio_nent--; 1483 head->lio_refcnt--; 1484 mutex_exit(&aiop->aio_mutex); 1485 } 1486 continue; 1487 } else { 1488 prev_fp = fp; 1489 prev_mode = mode; 1490 } 1491 } 1492 1493 if (error = aio_req_setup(&reqp, aiop, aiocb, 1494 &cbp->aio_resultp, aio_use_port, vp)) { 1495 releasef(aiocb->aio_fildes); 1496 lio_set_uerror(&cbp->aio_resultp, error); 1497 if (head) { 1498 mutex_enter(&aiop->aio_mutex); 1499 head->lio_nent--; 1500 head->lio_refcnt--; 1501 mutex_exit(&aiop->aio_mutex); 1502 } 1503 aio_errors++; 1504 continue; 1505 } 1506 1507 reqp->aio_req_lio = head; 1508 deadhead = 0; 1509 1510 /* 1511 * Set the errno field now before sending the request to 1512 * the driver to avoid a race condition 1513 */ 1514 (void) suword32(&cbp->aio_resultp.aio_errno, 1515 EINPROGRESS); 1516 1517 reqp->aio_req_iocb.iocb = (caddr_t)cbp; 1518 1519 if (aio_use_port) { 1520 reqp->aio_req_port = pnotify.portnfy_port; 1521 error = aio_req_assoc_port(&aiocb->aio_sigevent, 1522 pnotify.portnfy_user, cbp, reqp, pkevtp); 1523 } 1524 1525 /* 1526 * send the request to driver. 1527 * Clustering: If PXFS vnode, call PXFS function. 1528 */ 1529 if (error == 0) { 1530 if (aiocb->aio_nbytes == 0) { 1531 clear_active_fd(aiocb->aio_fildes); 1532 aio_zerolen(reqp); 1533 continue; 1534 } 1535 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 1536 CRED()); 1537 } 1538 /* 1539 * the fd's ref count is not decremented until the IO has 1540 * completed unless there was an error. 
		 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			if (error == ENOTSUP)
				aio_notsupported++;
			else
				aio_errors++;
			lio_set_error(reqp);
		} else {
			clear_active_fd(aiocb->aio_fildes);
		}
	}

	if (pkevtp)
		port_free_event(pkevtp);

	if (aio_notsupported) {
		error = ENOTSUP;
	} else if (aio_errors) {
		/*
		 * return EIO if any request failed
		 */
		error = EIO;
	}

	if (mode_arg == LIO_WAIT) {
		mutex_enter(&aiop->aio_mutex);
		while (head->lio_refcnt > 0) {
			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
				mutex_exit(&aiop->aio_mutex);
				error = EINTR;
				goto done;
			}
		}
		mutex_exit(&aiop->aio_mutex);
		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
	}

done:
	kmem_free(cbplist, ssize);
	if (deadhead) {
		if (head->lio_sigqp)
			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
		kmem_free(head, sizeof (aio_lio_t));
	}
	return (error);
}

#endif /* _LP64 */

/*
 * Asynchronous list IO.
 * If list I/O is called with LIO_WAIT it can still return
 * before all the I/Os are completed if a signal is caught
 * or if the list includes UFS I/O requests. If this happens,
 * libaio will call aliowait() to wait for the I/Os to
 * complete.
 */
/*ARGSUSED*/
static int
aliowait(
	int	mode,
	void	*aiocb,
	int	nent,
	void	*sigev,
	int	run_mode)
{
	aio_lio_t	*head;
	aio_t		*aiop;
	caddr_t		cbplist;
	aiocb_t		*cbp, **ucbp;
#ifdef _SYSCALL32_IMPL
	aiocb32_t	*cbp32;
	caddr32_t	*ucbp32;
	aiocb64_32_t	*cbp64;
#endif
	int		error = 0;
	int		i;
	size_t		ssize = 0;
	model_t		model = get_udatamodel();

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	if (model == DATAMODEL_NATIVE)
		ssize = (sizeof (aiocb_t *) * nent);
#ifdef _SYSCALL32_IMPL
	else
		ssize = (sizeof (caddr32_t) * nent);
#endif /* _SYSCALL32_IMPL */

	if (ssize == 0)
		return (EINVAL);

	cbplist = kmem_alloc(ssize, KM_SLEEP);

	if (model == DATAMODEL_NATIVE)
		ucbp = (aiocb_t **)cbplist;
#ifdef _SYSCALL32_IMPL
	else
		ucbp32 = (caddr32_t *)cbplist;
#endif /* _SYSCALL32_IMPL */

	if (copyin(aiocb, cbplist, ssize)) {
		error = EFAULT;
		goto done;
	}

	/*
	 * To find the list head, we go through the
	 * list of aiocb structs, find the request
	 * it's for, then get the list head that reqp
	 * points to.
	 */
	head = NULL;

	for (i = 0; i < nent; i++) {
		if (model == DATAMODEL_NATIVE) {
			/*
			 * Since we are only checking for a NULL pointer,
			 * the following should work for both native data
			 * sizes as well as for a largefile aiocb.
			 */
			if ((cbp = *ucbp++) == NULL)
				continue;
			if (run_mode != AIO_LARGEFILE)
				if (head = aio_list_get(&cbp->aio_resultp))
					break;
			else {
				/*
				 * This is the case when a largefile call is
				 * made on a 32 bit kernel.
1681 * Treat each pointer as pointer to 1682 * aiocb64_32 1683 */ 1684 if (head = aio_list_get((aio_result_t *) 1685 &(((aiocb64_32_t *)cbp)->aio_resultp))) 1686 break; 1687 } 1688 } 1689 #ifdef _SYSCALL32_IMPL 1690 else { 1691 if (run_mode == AIO_LARGEFILE) { 1692 if ((cbp64 = (aiocb64_32_t *) 1693 (uintptr_t)*ucbp32++) == NULL) 1694 continue; 1695 if (head = aio_list_get((aio_result_t *) 1696 &cbp64->aio_resultp)) 1697 break; 1698 } else if (run_mode == AIO_32) { 1699 if ((cbp32 = (aiocb32_t *) 1700 (uintptr_t)*ucbp32++) == NULL) 1701 continue; 1702 if (head = aio_list_get((aio_result_t *) 1703 &cbp32->aio_resultp)) 1704 break; 1705 } 1706 } 1707 #endif /* _SYSCALL32_IMPL */ 1708 } 1709 1710 if (head == NULL) { 1711 error = EINVAL; 1712 goto done; 1713 } 1714 1715 mutex_enter(&aiop->aio_mutex); 1716 while (head->lio_refcnt > 0) { 1717 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 1718 mutex_exit(&aiop->aio_mutex); 1719 error = EINTR; 1720 goto done; 1721 } 1722 } 1723 mutex_exit(&aiop->aio_mutex); 1724 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode); 1725 done: 1726 kmem_free(cbplist, ssize); 1727 return (error); 1728 } 1729 1730 aio_lio_t * 1731 aio_list_get(aio_result_t *resultp) 1732 { 1733 aio_lio_t *head = NULL; 1734 aio_t *aiop; 1735 aio_req_t **bucket; 1736 aio_req_t *reqp; 1737 long index; 1738 1739 aiop = curproc->p_aio; 1740 if (aiop == NULL) 1741 return (NULL); 1742 1743 if (resultp) { 1744 index = AIO_HASH(resultp); 1745 bucket = &aiop->aio_hash[index]; 1746 for (reqp = *bucket; reqp != NULL; 1747 reqp = reqp->aio_hash_next) { 1748 if (reqp->aio_req_resultp == resultp) { 1749 head = reqp->aio_req_lio; 1750 return (head); 1751 } 1752 } 1753 } 1754 return (NULL); 1755 } 1756 1757 1758 static void 1759 lio_set_uerror(void *resultp, int error) 1760 { 1761 /* 1762 * the resultp field is a pointer to where the 1763 * error should be written out to the user's 1764 * aiocb. 1765 * 1766 */ 1767 if (get_udatamodel() == DATAMODEL_NATIVE) { 1768 (void) sulword(&((aio_result_t *)resultp)->aio_return, 1769 (ssize_t)-1); 1770 (void) suword32(&((aio_result_t *)resultp)->aio_errno, error); 1771 } 1772 #ifdef _SYSCALL32_IMPL 1773 else { 1774 (void) suword32(&((aio_result32_t *)resultp)->aio_return, 1775 (uint_t)-1); 1776 (void) suword32(&((aio_result32_t *)resultp)->aio_errno, error); 1777 } 1778 #endif /* _SYSCALL32_IMPL */ 1779 } 1780 1781 /* 1782 * do cleanup completion for all requests in list. memory for 1783 * each request is also freed. 
1784 */ 1785 static void 1786 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode) 1787 { 1788 int i; 1789 aio_req_t *reqp; 1790 aio_result_t *resultp; 1791 aiocb64_32_t *aiocb_64; 1792 1793 for (i = 0; i < nent; i++) { 1794 if (get_udatamodel() == DATAMODEL_NATIVE) { 1795 if (cbp[i] == NULL) 1796 continue; 1797 if (run_mode == AIO_LARGEFILE) { 1798 aiocb_64 = (aiocb64_32_t *)cbp[i]; 1799 resultp = (aio_result_t *)&aiocb_64-> 1800 aio_resultp; 1801 } else 1802 resultp = &cbp[i]->aio_resultp; 1803 } 1804 #ifdef _SYSCALL32_IMPL 1805 else { 1806 aiocb32_t *aiocb_32; 1807 caddr32_t *cbp32; 1808 1809 cbp32 = (caddr32_t *)cbp; 1810 if (cbp32[i] == NULL) 1811 continue; 1812 if (run_mode == AIO_32) { 1813 aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i]; 1814 resultp = (aio_result_t *)&aiocb_32-> 1815 aio_resultp; 1816 } else if (run_mode == AIO_LARGEFILE) { 1817 aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i]; 1818 resultp = (aio_result_t *)&aiocb_64-> 1819 aio_resultp; 1820 } 1821 } 1822 #endif /* _SYSCALL32_IMPL */ 1823 /* 1824 * we need to get the aio_cleanupq_mutex since we call 1825 * aio_req_done(). 1826 */ 1827 mutex_enter(&aiop->aio_cleanupq_mutex); 1828 mutex_enter(&aiop->aio_mutex); 1829 reqp = aio_req_done(resultp); 1830 mutex_exit(&aiop->aio_mutex); 1831 mutex_exit(&aiop->aio_cleanupq_mutex); 1832 if (reqp != NULL) { 1833 aphysio_unlock(reqp); 1834 aio_copyout_result(reqp); 1835 mutex_enter(&aiop->aio_mutex); 1836 aio_req_free(aiop, reqp); 1837 mutex_exit(&aiop->aio_mutex); 1838 } 1839 } 1840 } 1841 1842 /* 1843 * write out the results for an aio request that is 1844 * done. 1845 */ 1846 static int 1847 aioerror(void *cb, int run_mode) 1848 { 1849 aio_result_t *resultp; 1850 aio_t *aiop; 1851 aio_req_t *reqp; 1852 int retval; 1853 1854 aiop = curproc->p_aio; 1855 if (aiop == NULL || cb == NULL) 1856 return (EINVAL); 1857 1858 if (get_udatamodel() == DATAMODEL_NATIVE) { 1859 if (run_mode == AIO_LARGEFILE) 1860 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)-> 1861 aio_resultp; 1862 else 1863 resultp = &((aiocb_t *)cb)->aio_resultp; 1864 } 1865 #ifdef _SYSCALL32_IMPL 1866 else { 1867 if (run_mode == AIO_LARGEFILE) 1868 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)-> 1869 aio_resultp; 1870 else if (run_mode == AIO_32) 1871 resultp = (aio_result_t *)&((aiocb32_t *)cb)-> 1872 aio_resultp; 1873 } 1874 #endif /* _SYSCALL32_IMPL */ 1875 /* 1876 * we need to get the aio_cleanupq_mutex since we call 1877 * aio_req_find(). 
1878 */ 1879 mutex_enter(&aiop->aio_cleanupq_mutex); 1880 mutex_enter(&aiop->aio_mutex); 1881 retval = aio_req_find(resultp, &reqp); 1882 mutex_exit(&aiop->aio_mutex); 1883 mutex_exit(&aiop->aio_cleanupq_mutex); 1884 if (retval == 0) { 1885 aphysio_unlock(reqp); 1886 aio_copyout_result(reqp); 1887 mutex_enter(&aiop->aio_mutex); 1888 aio_req_free(aiop, reqp); 1889 mutex_exit(&aiop->aio_mutex); 1890 return (0); 1891 } else if (retval == 1) 1892 return (EINPROGRESS); 1893 else if (retval == 2) 1894 return (EINVAL); 1895 return (0); 1896 } 1897 1898 /* 1899 * aio_cancel - if no requests outstanding, 1900 * return AIO_ALLDONE 1901 * else 1902 * return AIO_NOTCANCELED 1903 */ 1904 static int 1905 aio_cancel( 1906 int fildes, 1907 void *cb, 1908 long *rval, 1909 int run_mode) 1910 { 1911 aio_t *aiop; 1912 void *resultp; 1913 int index; 1914 aio_req_t **bucket; 1915 aio_req_t *ent; 1916 1917 1918 /* 1919 * Verify valid file descriptor 1920 */ 1921 if ((getf(fildes)) == NULL) { 1922 return (EBADF); 1923 } 1924 releasef(fildes); 1925 1926 aiop = curproc->p_aio; 1927 if (aiop == NULL) 1928 return (EINVAL); 1929 1930 if (aiop->aio_outstanding == 0) { 1931 *rval = AIO_ALLDONE; 1932 return (0); 1933 } 1934 1935 mutex_enter(&aiop->aio_mutex); 1936 if (cb != NULL) { 1937 if (get_udatamodel() == DATAMODEL_NATIVE) { 1938 if (run_mode == AIO_LARGEFILE) 1939 resultp = (aio_result_t *)&((aiocb64_32_t *)cb) 1940 ->aio_resultp; 1941 else 1942 resultp = &((aiocb_t *)cb)->aio_resultp; 1943 } 1944 #ifdef _SYSCALL32_IMPL 1945 else { 1946 if (run_mode == AIO_LARGEFILE) 1947 resultp = (aio_result_t *)&((aiocb64_32_t *)cb) 1948 ->aio_resultp; 1949 else if (run_mode == AIO_32) 1950 resultp = (aio_result_t *)&((aiocb32_t *)cb) 1951 ->aio_resultp; 1952 } 1953 #endif /* _SYSCALL32_IMPL */ 1954 index = AIO_HASH(resultp); 1955 bucket = &aiop->aio_hash[index]; 1956 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 1957 if (ent->aio_req_resultp == resultp) { 1958 if ((ent->aio_req_flags & AIO_PENDING) == 0) { 1959 mutex_exit(&aiop->aio_mutex); 1960 *rval = AIO_ALLDONE; 1961 return (0); 1962 } 1963 mutex_exit(&aiop->aio_mutex); 1964 *rval = AIO_NOTCANCELED; 1965 return (0); 1966 } 1967 } 1968 mutex_exit(&aiop->aio_mutex); 1969 *rval = AIO_ALLDONE; 1970 return (0); 1971 } 1972 1973 for (index = 0; index < AIO_HASHSZ; index++) { 1974 bucket = &aiop->aio_hash[index]; 1975 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 1976 if (ent->aio_req_fd == fildes) { 1977 if ((ent->aio_req_flags & AIO_PENDING) != 0) { 1978 mutex_exit(&aiop->aio_mutex); 1979 *rval = AIO_NOTCANCELED; 1980 return (0); 1981 } 1982 } 1983 } 1984 } 1985 mutex_exit(&aiop->aio_mutex); 1986 *rval = AIO_ALLDONE; 1987 return (0); 1988 } 1989 1990 /* 1991 * solaris version of asynchronous read and write 1992 */ 1993 static int 1994 arw( 1995 int opcode, 1996 int fdes, 1997 char *bufp, 1998 int bufsize, 1999 offset_t offset, 2000 aio_result_t *resultp, 2001 int mode) 2002 { 2003 file_t *fp; 2004 int error; 2005 struct vnode *vp; 2006 aio_req_t *reqp; 2007 aio_t *aiop; 2008 int (*aio_func)(); 2009 #ifdef _LP64 2010 aiocb_t aiocb; 2011 #else 2012 aiocb64_32_t aiocb64; 2013 #endif 2014 2015 aiop = curproc->p_aio; 2016 if (aiop == NULL) 2017 return (EINVAL); 2018 2019 if ((fp = getf(fdes)) == NULL) { 2020 return (EBADF); 2021 } 2022 2023 /* 2024 * check the permission of the partition 2025 */ 2026 if ((fp->f_flag & mode) == 0) { 2027 releasef(fdes); 2028 return (EBADF); 2029 } 2030 2031 vp = fp->f_vnode; 2032 aio_func = check_vp(vp, mode); 2033 if 
(aio_func == NULL) { 2034 releasef(fdes); 2035 return (EBADFD); 2036 } 2037 #ifdef _LP64 2038 aiocb.aio_fildes = fdes; 2039 aiocb.aio_buf = bufp; 2040 aiocb.aio_nbytes = bufsize; 2041 aiocb.aio_offset = offset; 2042 aiocb.aio_sigevent.sigev_notify = 0; 2043 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, 0, vp); 2044 #else 2045 aiocb64.aio_fildes = fdes; 2046 aiocb64.aio_buf = (caddr32_t)bufp; 2047 aiocb64.aio_nbytes = bufsize; 2048 aiocb64.aio_offset = offset; 2049 aiocb64.aio_sigevent.sigev_notify = 0; 2050 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, 0, vp); 2051 #endif 2052 if (error) { 2053 releasef(fdes); 2054 return (error); 2055 } 2056 2057 /* 2058 * enable polling on this request if the opcode has 2059 * the AIO poll bit set 2060 */ 2061 if (opcode & AIO_POLL_BIT) 2062 reqp->aio_req_flags |= AIO_POLL; 2063 2064 if (bufsize == 0) { 2065 clear_active_fd(fdes); 2066 aio_zerolen(reqp); 2067 return (0); 2068 } 2069 /* 2070 * send the request to driver. 2071 * Clustering: If PXFS vnode, call PXFS function. 2072 */ 2073 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED()); 2074 /* 2075 * the fd is stored in the aio_req_t by aio_req_setup(), and 2076 * is released by the aio_cleanup_thread() when the IO has 2077 * completed. 2078 */ 2079 if (error) { 2080 releasef(fdes); 2081 mutex_enter(&aiop->aio_mutex); 2082 aio_req_free(aiop, reqp); 2083 aiop->aio_pending--; 2084 if (aiop->aio_flags & AIO_REQ_BLOCK) 2085 cv_signal(&aiop->aio_cleanupcv); 2086 mutex_exit(&aiop->aio_mutex); 2087 return (error); 2088 } 2089 clear_active_fd(fdes); 2090 return (0); 2091 } 2092 2093 /* 2094 * Take request out of the port pending queue ... 2095 */ 2096 2097 void 2098 aio_deq_port_pending(aio_t *aiop, aio_req_t *reqp) 2099 { 2100 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2101 if (reqp->aio_req_prev == NULL) 2102 /* first request */ 2103 aiop->aio_portpending = reqp->aio_req_next; 2104 else 2105 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2106 if (reqp->aio_req_next != NULL) 2107 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2108 } 2109 2110 /* 2111 * posix version of asynchronous read and write 2112 */ 2113 static int 2114 aiorw( 2115 int opcode, 2116 void *aiocb_arg, 2117 int mode, 2118 int run_mode) 2119 { 2120 #ifdef _SYSCALL32_IMPL 2121 aiocb32_t aiocb32; 2122 struct sigevent32 *sigev32; 2123 port_notify32_t pntfy32; 2124 #endif 2125 aiocb64_32_t aiocb64; 2126 aiocb_t aiocb; 2127 file_t *fp; 2128 int error, fd; 2129 size_t bufsize; 2130 struct vnode *vp; 2131 aio_req_t *reqp; 2132 aio_t *aiop; 2133 int (*aio_func)(); 2134 aio_result_t *resultp; 2135 struct sigevent *sigev; 2136 model_t model; 2137 int aio_use_port = 0; 2138 port_notify_t pntfy; 2139 2140 model = get_udatamodel(); 2141 aiop = curproc->p_aio; 2142 if (aiop == NULL) 2143 return (EINVAL); 2144 2145 if (model == DATAMODEL_NATIVE) { 2146 if (run_mode != AIO_LARGEFILE) { 2147 if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t))) 2148 return (EFAULT); 2149 bufsize = aiocb.aio_nbytes; 2150 resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp); 2151 if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) { 2152 return (EBADF); 2153 } 2154 sigev = &aiocb.aio_sigevent; 2155 } else { 2156 /* 2157 * We come here only when we make largefile 2158 * call on 32 bit kernel using 32 bit library. 
2159 */ 2160 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t))) 2161 return (EFAULT); 2162 bufsize = aiocb64.aio_nbytes; 2163 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg) 2164 ->aio_resultp); 2165 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) { 2166 return (EBADF); 2167 } 2168 sigev = (struct sigevent *)&aiocb64.aio_sigevent; 2169 } 2170 2171 if (sigev->sigev_notify == SIGEV_PORT) { 2172 if (copyin((void *)sigev->sigev_value.sival_ptr, 2173 &pntfy, sizeof (port_notify_t))) { 2174 releasef(fd); 2175 return (EFAULT); 2176 } 2177 aio_use_port = 1; 2178 } 2179 } 2180 #ifdef _SYSCALL32_IMPL 2181 else { 2182 if (run_mode == AIO_32) { 2183 /* 32 bit system call is being made on 64 bit kernel */ 2184 if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t))) 2185 return (EFAULT); 2186 2187 bufsize = aiocb32.aio_nbytes; 2188 aiocb_32ton(&aiocb32, &aiocb); 2189 resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)-> 2190 aio_resultp); 2191 if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) { 2192 return (EBADF); 2193 } 2194 sigev32 = &aiocb32.aio_sigevent; 2195 } else if (run_mode == AIO_LARGEFILE) { 2196 /* 2197 * We come here only when we make largefile 2198 * call on 64 bit kernel using 32 bit library. 2199 */ 2200 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t))) 2201 return (EFAULT); 2202 bufsize = aiocb64.aio_nbytes; 2203 aiocb_LFton(&aiocb64, &aiocb); 2204 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg) 2205 ->aio_resultp); 2206 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) 2207 return (EBADF); 2208 sigev32 = &aiocb64.aio_sigevent; 2209 } 2210 2211 if (sigev32->sigev_notify == SIGEV_PORT) { 2212 if (copyin( 2213 (void *)(uintptr_t)sigev32->sigev_value.sival_ptr, 2214 &pntfy32, sizeof (port_notify32_t))) { 2215 releasef(fd); 2216 return (EFAULT); 2217 } 2218 pntfy.portnfy_port = pntfy32.portnfy_port; 2219 pntfy.portnfy_user = 2220 (void *)(uintptr_t)pntfy32.portnfy_user; 2221 aio_use_port = 1; 2222 } 2223 } 2224 #endif /* _SYSCALL32_IMPL */ 2225 2226 /* 2227 * check the permission of the partition 2228 */ 2229 2230 if ((fp->f_flag & mode) == 0) { 2231 releasef(fd); 2232 return (EBADF); 2233 } 2234 2235 vp = fp->f_vnode; 2236 aio_func = check_vp(vp, mode); 2237 if (aio_func == NULL) { 2238 releasef(fd); 2239 return (EBADFD); 2240 } 2241 if ((model == DATAMODEL_NATIVE) && (run_mode == AIO_LARGEFILE)) 2242 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, 2243 aio_use_port, vp); 2244 else 2245 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, 2246 aio_use_port, vp); 2247 2248 if (error) { 2249 releasef(fd); 2250 return (error); 2251 } 2252 /* 2253 * enable polling on this request if the opcode has 2254 * the AIO poll bit set 2255 */ 2256 if (opcode & AIO_POLL_BIT) 2257 reqp->aio_req_flags |= AIO_POLL; 2258 2259 if (model == DATAMODEL_NATIVE) 2260 reqp->aio_req_iocb.iocb = aiocb_arg; 2261 #ifdef _SYSCALL32_IMPL 2262 else 2263 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg; 2264 #endif 2265 2266 if (aio_use_port) 2267 error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp); 2268 2269 /* 2270 * send the request to driver. 2271 * Clustering: If PXFS vnode, call PXFS function. 2272 */ 2273 if (error == 0) { 2274 if (bufsize == 0) { 2275 clear_active_fd(fd); 2276 aio_zerolen(reqp); 2277 return (0); 2278 } 2279 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED()); 2280 } 2281 2282 /* 2283 * the fd is stored in the aio_req_t by aio_req_setup(), and 2284 * is released by the aio_cleanup_thread() when the IO has 2285 * completed. 
*/ 2287 if (error) { 2288 releasef(fd); 2289 mutex_enter(&aiop->aio_mutex); 2290 aio_deq_port_pending(aiop, reqp); 2291 aio_req_free(aiop, reqp); 2292 aiop->aio_pending--; 2293 if (aiop->aio_flags & AIO_REQ_BLOCK) 2294 cv_signal(&aiop->aio_cleanupcv); 2295 mutex_exit(&aiop->aio_mutex); 2296 return (error); 2297 } 2298 clear_active_fd(fd); 2299 return (0); 2300 } 2301 2302 2303 /* 2304 * set error for a list IO entry that failed. 2305 */ 2306 static void 2307 lio_set_error(aio_req_t *reqp) 2308 { 2309 aio_t *aiop = curproc->p_aio; 2310 2311 if (aiop == NULL) 2312 return; 2313 2314 mutex_enter(&aiop->aio_mutex); 2315 aio_deq_port_pending(aiop, reqp); 2316 aiop->aio_pending--; 2317 /* request failed, AIO_PHYSIODONE set to avoid physio cleanup. */ 2318 reqp->aio_req_flags |= AIO_PHYSIODONE; 2319 /* 2320 * Need to free the request now as it's never 2321 * going to get on the done queue 2322 * 2323 * Note: aio_outstanding is decremented in 2324 * aio_req_free() 2325 */ 2326 aio_req_free(aiop, reqp); 2327 if (aiop->aio_flags & AIO_REQ_BLOCK) 2328 cv_signal(&aiop->aio_cleanupcv); 2329 mutex_exit(&aiop->aio_mutex); 2330 } 2331 2332 /* 2333 * check if a specified request is done, and remove it from 2334 * the done queue. Otherwise remove any request from the done queue 2335 * if NULL is specified. 2336 */ 2337 static aio_req_t * 2338 aio_req_done(void *resultp) 2339 { 2340 aio_req_t **bucket; 2341 aio_req_t *ent; 2342 aio_t *aiop = curproc->p_aio; 2343 long index; 2344 2345 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex)); 2346 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2347 2348 if (resultp) { 2349 index = AIO_HASH(resultp); 2350 bucket = &aiop->aio_hash[index]; 2351 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 2352 if (ent->aio_req_resultp == (aio_result_t *)resultp) { 2353 if (ent->aio_req_flags & AIO_DONEQ) { 2354 return (aio_req_remove(ent)); 2355 } 2356 return (NULL); 2357 } 2358 } 2359 /* no match, resultp is invalid */ 2360 return (NULL); 2361 } 2362 return (aio_req_remove(NULL)); 2363 } 2364 2365 /* 2366 * determine if a user-level resultp pointer is associated with an 2367 * active IO request. Zero is returned when the request is done, 2368 * and the request is removed from the done queue. Only when the 2369 * return value is zero, is the "reqp" pointer valid. One is returned 2370 * when the request is in progress. Two is returned when the request 2371 * is invalid. 2372 */ 2373 static int 2374 aio_req_find(aio_result_t *resultp, aio_req_t **reqp) 2375 { 2376 aio_req_t **bucket; 2377 aio_req_t *ent; 2378 aio_t *aiop = curproc->p_aio; 2379 long index; 2380 2381 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex)); 2382 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2383 2384 index = AIO_HASH(resultp); 2385 bucket = &aiop->aio_hash[index]; 2386 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 2387 if (ent->aio_req_resultp == resultp) { 2388 if (ent->aio_req_flags & AIO_DONEQ) { 2389 *reqp = aio_req_remove(ent); 2390 return (0); 2391 } 2392 return (1); 2393 } 2394 } 2395 /* no match, resultp is invalid */ 2396 return (2); 2397 } 2398 2399 /* 2400 * remove a request from the done queue.
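 *
 * (Added note for clarity.)  aio_doneq and aio_cleanupq are circular,
 * doubly linked lists; a request on either queue has AIO_DONEQ set in
 * aio_req_flags.  Called with a specific reqp, this unlinks that
 * request from whichever of the two queues it is on.  Called with
 * NULL, it dequeues the request at the head of aio_doneq, if any.  In
 * both cases AIO_DONEQ is cleared before the request is returned.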
2401 */ 2402 static aio_req_t * 2403 aio_req_remove(aio_req_t *reqp) 2404 { 2405 aio_t *aiop = curproc->p_aio; 2406 aio_req_t *head; 2407 2408 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2409 2410 if (reqp) { 2411 ASSERT(reqp->aio_req_flags & AIO_DONEQ); 2412 if (reqp->aio_req_next == reqp) { 2413 /* only one request on queue */ 2414 if (reqp == aiop->aio_doneq) { 2415 aiop->aio_doneq = NULL; 2416 } else { 2417 ASSERT(reqp == aiop->aio_cleanupq); 2418 aiop->aio_cleanupq = NULL; 2419 } 2420 } else { 2421 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2422 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2423 /* 2424 * The request can be either on the aio_doneq or the 2425 * aio_cleanupq 2426 */ 2427 if (reqp == aiop->aio_doneq) 2428 aiop->aio_doneq = reqp->aio_req_next; 2429 2430 if (reqp == aiop->aio_cleanupq) 2431 aiop->aio_cleanupq = reqp->aio_req_next; 2432 } 2433 reqp->aio_req_flags &= ~AIO_DONEQ; 2434 return (reqp); 2435 } 2436 2437 if (aiop->aio_doneq) { 2438 head = aiop->aio_doneq; 2439 ASSERT(head->aio_req_flags & AIO_DONEQ); 2440 if (head == head->aio_req_next) { 2441 /* only one request on queue */ 2442 aiop->aio_doneq = NULL; 2443 } else { 2444 head->aio_req_prev->aio_req_next = head->aio_req_next; 2445 head->aio_req_next->aio_req_prev = head->aio_req_prev; 2446 aiop->aio_doneq = head->aio_req_next; 2447 } 2448 head->aio_req_flags &= ~AIO_DONEQ; 2449 return (head); 2450 } 2451 return (NULL); 2452 } 2453 2454 static int 2455 aio_req_setup( 2456 aio_req_t **reqpp, 2457 aio_t *aiop, 2458 aiocb_t *arg, 2459 aio_result_t *resultp, 2460 int port, 2461 vnode_t *vp) 2462 { 2463 aio_req_t *reqp; 2464 sigqueue_t *sqp; 2465 struct uio *uio; 2466 2467 struct sigevent *sigev; 2468 int error; 2469 2470 sigev = &arg->aio_sigevent; 2471 if ((sigev->sigev_notify == SIGEV_SIGNAL) && 2472 (sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG)) { 2473 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 2474 if (sqp == NULL) 2475 return (EAGAIN); 2476 sqp->sq_func = NULL; 2477 sqp->sq_next = NULL; 2478 sqp->sq_info.si_code = SI_ASYNCIO; 2479 sqp->sq_info.si_pid = curproc->p_pid; 2480 sqp->sq_info.si_ctid = PRCTID(curproc); 2481 sqp->sq_info.si_zoneid = getzoneid(); 2482 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 2483 sqp->sq_info.si_signo = sigev->sigev_signo; 2484 sqp->sq_info.si_value = sigev->sigev_value; 2485 } else 2486 sqp = NULL; 2487 2488 mutex_enter(&aiop->aio_mutex); 2489 2490 if (aiop->aio_flags & AIO_REQ_BLOCK) { 2491 mutex_exit(&aiop->aio_mutex); 2492 if (sqp) 2493 kmem_free(sqp, sizeof (sigqueue_t)); 2494 return (EIO); 2495 } 2496 /* 2497 * get an aio_reqp from the free list or allocate one 2498 * from dynamic memory. 2499 */ 2500 if (error = aio_req_alloc(&reqp, resultp)) { 2501 mutex_exit(&aiop->aio_mutex); 2502 if (sqp) 2503 kmem_free(sqp, sizeof (sigqueue_t)); 2504 return (error); 2505 } 2506 aiop->aio_pending++; 2507 aiop->aio_outstanding++; 2508 reqp->aio_req_flags = AIO_PENDING; 2509 if (port) 2510 aio_enq_port_pending(aiop, reqp); 2511 mutex_exit(&aiop->aio_mutex); 2512 /* 2513 * initialize aio request. 2514 */ 2515 reqp->aio_req_fd = arg->aio_fildes; 2516 reqp->aio_req_sigqp = sqp; 2517 reqp->aio_req_iocb.iocb = NULL; 2518 reqp->aio_req_buf.b_file = vp; 2519 uio = reqp->aio_req.aio_uio; 2520 uio->uio_iovcnt = 1; 2521 uio->uio_iov->iov_base = (caddr_t)arg->aio_buf; 2522 uio->uio_iov->iov_len = arg->aio_nbytes; 2523 uio->uio_loffset = arg->aio_offset; 2524 *reqpp = reqp; 2525 return (0); 2526 } 2527 2528 /* 2529 * Allocate p_aio struct. 
*/ 2531 static aio_t * 2532 aio_aiop_alloc(void) 2533 { 2534 aio_t *aiop; 2535 2536 ASSERT(MUTEX_HELD(&curproc->p_lock)); 2537 2538 aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP); 2539 if (aiop) { 2540 mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL); 2541 mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT, 2542 NULL); 2543 mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL); 2544 } 2545 return (aiop); 2546 } 2547 2548 /* 2549 * Allocate an aio_req struct. 2550 */ 2551 static int 2552 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp) 2553 { 2554 aio_req_t *reqp; 2555 aio_t *aiop = curproc->p_aio; 2556 2557 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2558 2559 if ((reqp = aiop->aio_free) != NULL) { 2560 reqp->aio_req_flags = 0; 2561 aiop->aio_free = reqp->aio_req_next; 2562 /* 2563 * Clustering: This field has to be specifically 2564 * set to null so that the right thing can be 2565 * done in aphysio() 2566 */ 2567 reqp->aio_req_buf.b_iodone = NULL; 2568 } else { 2569 /* 2570 * Check whether memory is getting tight. 2571 * This is a temporary mechanism to avoid memory 2572 * exhaustion by a single process until we come up 2573 * with a per process solution such as setrlimit(). 2574 */ 2575 if (freemem < desfree) 2576 return (EAGAIN); 2577 2578 reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP); 2579 if (reqp == NULL) 2580 return (EAGAIN); 2581 reqp->aio_req.aio_uio = &(reqp->aio_req_uio); 2582 reqp->aio_req.aio_uio->uio_iov = &(reqp->aio_req_iov); 2583 reqp->aio_req.aio_private = reqp; 2584 } 2585 2586 reqp->aio_req_buf.b_offset = -1; 2587 reqp->aio_req_resultp = resultp; 2588 if (aio_hash_insert(reqp, aiop)) { 2589 reqp->aio_req_next = aiop->aio_free; 2590 aiop->aio_free = reqp; 2591 return (EINVAL); 2592 } 2593 *nreqp = reqp; 2594 return (0); 2595 } 2596 2597 /* 2598 * Allocate an aio_lio_t struct. 2599 */ 2600 static int 2601 aio_lio_alloc(aio_lio_t **head) 2602 { 2603 aio_lio_t *liop; 2604 aio_t *aiop = curproc->p_aio; 2605 2606 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2607 2608 if ((liop = aiop->aio_lio_free) != NULL) { 2609 aiop->aio_lio_free = liop->lio_next; 2610 } else { 2611 /* 2612 * Check whether memory is getting tight. 2613 * This is a temporary mechanism to avoid memory 2614 * exhaustion by a single process until we come up 2615 * with a per process solution such as setrlimit(). 2616 */ 2617 if (freemem < desfree) 2618 return (EAGAIN); 2619 2620 liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP); 2621 if (liop == NULL) 2622 return (EAGAIN); 2623 } 2624 *head = liop; 2625 return (0); 2626 } 2627 2628 /* 2629 * This is a special per-process thread that is only activated if 2630 * the process is unmapping a segment with outstanding aio. Normally, 2631 * the process will have completed the aio before unmapping the 2632 * segment. If the process does unmap a segment with outstanding aio, 2633 * this special thread will guarantee that the locked pages due to 2634 * aphysio() are released, thereby permitting the segment to be 2635 * unmapped. In addition to this, the cleanup thread is woken up 2636 * during DR operations to release the locked pages.
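 *
 * (Added summary of the loop below; see the code for the details.)
 *	for (;;) {
 *		set AIO_CLEANUP; if as_unmap() or a DR cleanup request
 *		    is waiting, move the done queue onto the cleanup queue
 *		call aio_cleanup(AIO_CLEANUP_THREAD) to unlock pages and
 *		    put the requests back on the done queue
 *		clear AIO_CLEANUP once there is nothing left to wait for
 *		cv_wait_sig() on as->a_cv (or aio_cleanupcv) until woken
 *		    by as_unmap(), a DR cleanup request, or pokelwps()
 *	}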
2637 */ 2638 2639 static int 2640 aio_cleanup_thread(aio_t *aiop) 2641 { 2642 proc_t *p = curproc; 2643 struct as *as = p->p_as; 2644 int poked = 0; 2645 kcondvar_t *cvp; 2646 int exit_flag = 0; 2647 int rqclnup = 0; 2648 2649 sigfillset(&curthread->t_hold); 2650 sigdiffset(&curthread->t_hold, &cantmask); 2651 for (;;) { 2652 /* 2653 * if a segment is being unmapped, and the current 2654 * process's done queue is not empty, then every request 2655 * on the doneq with locked resources should be forced 2656 * to release their locks. By moving the doneq request 2657 * to the cleanupq, aio_cleanup() will process the cleanupq, 2658 * and place requests back onto the doneq. All requests 2659 * processed by aio_cleanup() will have their physical 2660 * resources unlocked. 2661 */ 2662 mutex_enter(&aiop->aio_mutex); 2663 if ((aiop->aio_flags & AIO_CLEANUP) == 0) { 2664 aiop->aio_flags |= AIO_CLEANUP; 2665 mutex_enter(&as->a_contents); 2666 if (aiop->aio_rqclnup) { 2667 aiop->aio_rqclnup = 0; 2668 rqclnup = 1; 2669 } 2670 2671 if ((rqclnup || AS_ISUNMAPWAIT(as)) && 2672 aiop->aio_doneq) { 2673 aio_req_t *doneqhead = aiop->aio_doneq; 2674 mutex_exit(&as->a_contents); 2675 aiop->aio_doneq = NULL; 2676 aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ); 2677 } else { 2678 mutex_exit(&as->a_contents); 2679 } 2680 } 2681 mutex_exit(&aiop->aio_mutex); 2682 aio_cleanup(AIO_CLEANUP_THREAD); 2683 /* 2684 * thread should block on the cleanupcv while 2685 * AIO_CLEANUP is set. 2686 */ 2687 cvp = &aiop->aio_cleanupcv; 2688 mutex_enter(&aiop->aio_mutex); 2689 2690 if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL || 2691 aiop->aio_notifyq != NULL || 2692 aiop->aio_portcleanupq != NULL) { 2693 mutex_exit(&aiop->aio_mutex); 2694 continue; 2695 } 2696 mutex_enter(&as->a_contents); 2697 2698 /* 2699 * AIO_CLEANUP determines when the cleanup thread 2700 * should be active. This flag is set when 2701 * the cleanup thread is awakened by as_unmap() or 2702 * due to DR operations. 2703 * The flag is cleared when the blocking as_unmap() 2704 * that originally awakened us is allowed to 2705 * complete. as_unmap() blocks when trying to 2706 * unmap a segment that has SOFTLOCKed pages. when 2707 * the segment's pages are all SOFTUNLOCKed, 2708 * as->a_flags & AS_UNMAPWAIT should be zero. 2709 * 2710 * In case of cleanup request by DR, the flag is cleared 2711 * once all the pending aio requests have been processed. 2712 * 2713 * The flag shouldn't be cleared right away if the 2714 * cleanup thread was interrupted because the process 2715 * is doing forkall(). This happens when cv_wait_sig() 2716 * returns zero, because it was awakened by a pokelwps(). 2717 * If the process is not exiting, it must be doing forkall(). 2718 */ 2719 if ((poked == 0) && 2720 ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) || 2721 (aiop->aio_pending == 0))) { 2722 aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT); 2723 cvp = &as->a_cv; 2724 rqclnup = 0; 2725 } 2726 mutex_exit(&aiop->aio_mutex); 2727 if (poked) { 2728 /* 2729 * If the process is exiting/killed, don't return 2730 * immediately without waiting for pending I/O's 2731 * and releasing the page locks. 2732 */ 2733 if (p->p_flag & (SEXITLWPS|SKILLED)) { 2734 /* 2735 * If exit_flag is set, then it is 2736 * safe to exit because we have released 2737 * page locks of completed I/O's. 2738 */ 2739 if (exit_flag) 2740 break; 2741 2742 mutex_exit(&as->a_contents); 2743 2744 /* 2745 * Wait for all the pending aio to complete. 
*/ 2747 mutex_enter(&aiop->aio_mutex); 2748 aiop->aio_flags |= AIO_REQ_BLOCK; 2749 while (aiop->aio_pending != 0) 2750 cv_wait(&aiop->aio_cleanupcv, 2751 &aiop->aio_mutex); 2752 mutex_exit(&aiop->aio_mutex); 2753 exit_flag = 1; 2754 continue; 2755 } else if (p->p_flag & 2756 (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) { 2757 /* 2758 * hold LWP until it 2759 * is continued. 2760 */ 2761 mutex_exit(&as->a_contents); 2762 mutex_enter(&p->p_lock); 2763 stop(PR_SUSPENDED, SUSPEND_NORMAL); 2764 mutex_exit(&p->p_lock); 2765 poked = 0; 2766 continue; 2767 } 2768 } else { 2769 /* 2770 * When started this thread will sleep on as->a_cv. 2771 * as_unmap will wake this thread if the 2772 * segment has SOFTLOCKed pages (poked = 0). 2773 * 1. pokelwps() wakes this thread => 2774 * break the loop to check SEXITLWPS, SHOLDFORK, etc. 2775 * 2. as_unmap wakes this thread => 2776 * to break the loop it is necessary that 2777 * - AS_UNMAPWAIT is set (as_unmap is waiting for 2778 * memory to be unlocked) 2779 * - AIO_CLEANUP is not set 2780 * (if AIO_CLEANUP is set we have to wait for 2781 * pending requests. aio_done will send a signal 2782 * for every request which completes to continue 2783 * unmapping the corresponding address range) 2784 * 3. A cleanup request will wake this thread up, e.g. 2785 * by the DR operations. The aio_rqclnup flag will 2786 * be set. 2787 */ 2788 while (poked == 0) { 2789 /* 2790 * we need to handle cleanup requests 2791 * that come in after we had just cleaned up, 2792 * so that we do cleanup of any new aio 2793 * requests that got completed and have 2794 * locked resources. 2795 */ 2796 if ((aiop->aio_rqclnup || 2797 (AS_ISUNMAPWAIT(as) != 0)) && 2798 (aiop->aio_flags & AIO_CLEANUP) == 0) 2799 break; 2800 poked = !cv_wait_sig(cvp, &as->a_contents); 2801 if (AS_ISUNMAPWAIT(as) == 0) 2802 cv_signal(cvp); 2803 if (aiop->aio_outstanding != 0) 2804 break; 2805 } 2806 } 2807 mutex_exit(&as->a_contents); 2808 } 2809 exit: 2810 mutex_exit(&as->a_contents); 2811 ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED))); 2812 aston(curthread); /* make thread do post_syscall */ 2813 return (0); 2814 } 2815 2816 /* 2817 * save a reference to a user's outstanding aio in a hash list. 2818 */ 2819 static int 2820 aio_hash_insert( 2821 aio_req_t *aio_reqp, 2822 aio_t *aiop) 2823 { 2824 long index; 2825 aio_result_t *resultp = aio_reqp->aio_req_resultp; 2826 aio_req_t *current; 2827 aio_req_t **nextp; 2828 2829 index = AIO_HASH(resultp); 2830 nextp = &aiop->aio_hash[index]; 2831 while ((current = *nextp) != NULL) { 2832 if (current->aio_req_resultp == resultp) 2833 return (DUPLICATE); 2834 nextp = &current->aio_hash_next; 2835 } 2836 *nextp = aio_reqp; 2837 aio_reqp->aio_hash_next = NULL; 2838 return (0); 2839 } 2840 2841 static int 2842 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *, 2843 cred_t *) 2844 { 2845 struct snode *sp; 2846 dev_t dev; 2847 struct cb_ops *cb; 2848 major_t major; 2849 int (*aio_func)(); 2850 2851 dev = vp->v_rdev; 2852 major = getmajor(dev); 2853 2854 /* 2855 * return NULL for requests to files and STREAMs so 2856 * that libaio takes care of them. 2857 */ 2858 if (vp->v_type == VCHR) { 2859 /* no stream device for kaio */ 2860 if (STREAMSTAB(major)) { 2861 return (NULL); 2862 } 2863 } else { 2864 return (NULL); 2865 } 2866 2867 /* 2868 * Check old drivers which do not have async I/O entry points.
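 *
 * (Added note; the "mydrv" names below are hypothetical.)  check_vp()
 * only returns a kaio routine when the driver advertises the
 * aread(9E)/awrite(9E) entry points: devo_rev must be >= 3, cb_rev
 * must be >= 1, cb_strategy must be a real strategy routine, and
 * cb_aread/cb_awrite must be something other than nodev.  A driver
 * typically hooks these up in its cb_ops, along the lines of:
 *
 *	mydrv_cb_ops.cb_rev = CB_REV;
 *	mydrv_cb_ops.cb_aread = mydrv_aread;	(aread(9E) routine)
 *	mydrv_cb_ops.cb_awrite = mydrv_awrite;	(awrite(9E) routine)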
*/ 2870 if (devopsp[major]->devo_rev < 3) 2871 return (NULL); 2872 2873 cb = devopsp[major]->devo_cb_ops; 2874 2875 if (cb->cb_rev < 1) 2876 return (NULL); 2877 2878 /* 2879 * Check whether this device is a block device. 2880 * Kaio is not supported for devices like tty. 2881 */ 2882 if (cb->cb_strategy == nodev || cb->cb_strategy == NULL) 2883 return (NULL); 2884 2885 /* 2886 * Clustering: If vnode is a PXFS vnode, then the device may be remote. 2887 * We cannot call the driver directly. Instead return the 2888 * PXFS functions. 2889 */ 2890 2891 if (IS_PXFSVP(vp)) { 2892 if (mode & FREAD) 2893 return (clpxfs_aio_read); 2894 else 2895 return (clpxfs_aio_write); 2896 } 2897 if (mode & FREAD) 2898 aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read; 2899 else 2900 aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write; 2901 2902 /* 2903 * Do we need this? 2904 * nodev returns ENXIO anyway. 2905 */ 2906 if (aio_func == nodev) 2907 return (NULL); 2908 2909 sp = VTOS(vp); 2910 smark(sp, SACC); 2911 return (aio_func); 2912 } 2913 2914 /* 2915 * Clustering: We want check_vp to return a function prototyped 2916 * correctly that will be common to both the PXFS and the regular case. 2917 * We define this intermediate function that will do the right 2918 * thing for driver cases. 2919 */ 2920 2921 static int 2922 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p) 2923 { 2924 dev_t dev; 2925 struct cb_ops *cb; 2926 2927 ASSERT(vp->v_type == VCHR); 2928 ASSERT(!IS_PXFSVP(vp)); 2929 dev = VTOS(vp)->s_dev; 2930 ASSERT(STREAMSTAB(getmajor(dev)) == NULL); 2931 2932 cb = devopsp[getmajor(dev)]->devo_cb_ops; 2933 2934 ASSERT(cb->cb_awrite != nodev); 2935 return ((*cb->cb_awrite)(dev, aio, cred_p)); 2936 } 2937 2938 /* 2939 * Clustering: We want check_vp to return a function prototyped 2940 * correctly that will be common to both the PXFS and the regular case. 2941 * We define this intermediate function that will do the right 2942 * thing for driver cases. 2943 */ 2944 2945 static int 2946 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p) 2947 { 2948 dev_t dev; 2949 struct cb_ops *cb; 2950 2951 ASSERT(vp->v_type == VCHR); 2952 ASSERT(!IS_PXFSVP(vp)); 2953 dev = VTOS(vp)->s_dev; 2954 ASSERT(!STREAMSTAB(getmajor(dev))); 2955 2956 cb = devopsp[getmajor(dev)]->devo_cb_ops; 2957 2958 ASSERT(cb->cb_aread != nodev); 2959 return ((*cb->cb_aread)(dev, aio, cred_p)); 2960 } 2961 2962 /* 2963 * This routine is called when a largefile call is made by a 32bit 2964 * process on an ILP32 or LP64 kernel. All 64bit processes are large 2965 * file by definition and will call alio() instead.
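 *
 * (Added note; the library-side details are assumptions.)  A 32 bit
 * application normally reaches this path by compiling in the
 * large-file environment (e.g. with -D_FILE_OFFSET_BITS=64) or by
 * using the explicit transitional interfaces (aiocb64_t,
 * lio_listio64()), so that its list I/O requests carry 64 bit file
 * offsets.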
2966 */ 2967 static int 2968 alioLF( 2969 int mode_arg, 2970 void *aiocb_arg, 2971 int nent, 2972 void *sigev) 2973 { 2974 file_t *fp; 2975 file_t *prev_fp = NULL; 2976 int prev_mode = -1; 2977 struct vnode *vp; 2978 aio_lio_t *head; 2979 aio_req_t *reqp; 2980 aio_t *aiop; 2981 caddr_t cbplist; 2982 aiocb64_32_t *cbp; 2983 caddr32_t *ucbp; 2984 aiocb64_32_t cb64; 2985 aiocb64_32_t *aiocb = &cb64; 2986 #ifdef _LP64 2987 aiocb_t aiocb_n; 2988 #endif 2989 struct sigevent32 sigevk; 2990 sigqueue_t *sqp; 2991 int (*aio_func)(); 2992 int mode; 2993 int error = 0, aio_errors = 0; 2994 int i; 2995 size_t ssize; 2996 int deadhead = 0; 2997 int aio_notsupported = 0; 2998 int aio_use_port = 0; 2999 port_kevent_t *pkevtp = NULL; 3000 port_notify32_t pnotify; 3001 3002 aiop = curproc->p_aio; 3003 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 3004 return (EINVAL); 3005 3006 ASSERT(get_udatamodel() == DATAMODEL_ILP32); 3007 3008 ssize = (sizeof (caddr32_t) * nent); 3009 cbplist = kmem_alloc(ssize, KM_SLEEP); 3010 ucbp = (caddr32_t *)cbplist; 3011 3012 if (copyin(aiocb_arg, cbplist, ssize)) { 3013 kmem_free(cbplist, ssize); 3014 return (EFAULT); 3015 } 3016 3017 if (sigev) { 3018 if (copyin(sigev, &sigevk, sizeof (sigevk))) { 3019 kmem_free(cbplist, ssize); 3020 return (EFAULT); 3021 } 3022 } 3023 3024 /* 3025 * a list head should be allocated if notification is 3026 * enabled for this list. 3027 */ 3028 head = NULL; 3029 3030 /* Event Ports */ 3031 3032 if (sigev && sigevk.sigev_notify == SIGEV_PORT) { 3033 /* Use PORT for completion notification */ 3034 if (copyin((void *)(uintptr_t)sigevk.sigev_value.sival_ptr, 3035 &pnotify, sizeof (port_notify32_t))) { 3036 kmem_free(cbplist, ssize); 3037 return (EFAULT); 3038 } 3039 /* use event ports for the list of aiocbs */ 3040 aio_use_port = 1; 3041 error = port_alloc_event(pnotify.portnfy_port, 3042 PORT_ALLOC_PRIVATE, PORT_SOURCE_AIO, &pkevtp); 3043 if (error) { 3044 if (error == ENOMEM) 3045 error = EAGAIN; 3046 kmem_free(cbplist, ssize); 3047 return (error); 3048 } 3049 } else if ((mode_arg == LIO_WAIT) || sigev) { 3050 mutex_enter(&aiop->aio_mutex); 3051 error = aio_lio_alloc(&head); 3052 mutex_exit(&aiop->aio_mutex); 3053 if (error) 3054 goto done; 3055 deadhead = 1; 3056 head->lio_nent = nent; 3057 head->lio_refcnt = nent; 3058 if (sigev && (sigevk.sigev_notify == SIGEV_SIGNAL) && 3059 (sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG)) { 3060 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3061 if (sqp == NULL) { 3062 error = EAGAIN; 3063 goto done; 3064 } 3065 sqp->sq_func = NULL; 3066 sqp->sq_next = NULL; 3067 sqp->sq_info.si_code = SI_ASYNCIO; 3068 sqp->sq_info.si_pid = curproc->p_pid; 3069 sqp->sq_info.si_ctid = PRCTID(curproc); 3070 sqp->sq_info.si_zoneid = getzoneid(); 3071 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3072 sqp->sq_info.si_signo = sigevk.sigev_signo; 3073 sqp->sq_info.si_value.sival_int = 3074 sigevk.sigev_value.sival_int; 3075 head->lio_sigqp = sqp; 3076 } else { 3077 head->lio_sigqp = NULL; 3078 } 3079 } 3080 3081 for (i = 0; i < nent; i++, ucbp++) { 3082 3083 cbp = (aiocb64_32_t *)(uintptr_t)*ucbp; 3084 /* skip entry if it can't be copied. 
*/ 3085 if (cbp == NULL || copyin(cbp, aiocb, sizeof (aiocb64_32_t))) { 3086 if (head) { 3087 mutex_enter(&aiop->aio_mutex); 3088 head->lio_nent--; 3089 head->lio_refcnt--; 3090 mutex_exit(&aiop->aio_mutex); 3091 } 3092 continue; 3093 } 3094 3095 /* skip if opcode for aiocb is LIO_NOP */ 3096 3097 mode = aiocb->aio_lio_opcode; 3098 if (mode == LIO_NOP) { 3099 cbp = NULL; 3100 if (head) { 3101 mutex_enter(&aiop->aio_mutex); 3102 head->lio_nent--; 3103 head->lio_refcnt--; 3104 mutex_exit(&aiop->aio_mutex); 3105 } 3106 continue; 3107 } 3108 3109 /* increment file descriptor's ref count. */ 3110 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 3111 lio_set_uerror(&cbp->aio_resultp, EBADF); 3112 if (head) { 3113 mutex_enter(&aiop->aio_mutex); 3114 head->lio_nent--; 3115 head->lio_refcnt--; 3116 mutex_exit(&aiop->aio_mutex); 3117 } 3118 aio_errors++; 3119 continue; 3120 } 3121 3122 vp = fp->f_vnode; 3123 3124 /* 3125 * check the permission of the partition 3126 */ 3127 mode = aiocb->aio_lio_opcode; 3128 if ((fp->f_flag & mode) == 0) { 3129 releasef(aiocb->aio_fildes); 3130 lio_set_uerror(&cbp->aio_resultp, EBADF); 3131 if (head) { 3132 mutex_enter(&aiop->aio_mutex); 3133 head->lio_nent--; 3134 head->lio_refcnt--; 3135 mutex_exit(&aiop->aio_mutex); 3136 } 3137 aio_errors++; 3138 continue; 3139 } 3140 3141 /* 3142 * common case where requests are to the same fd 3143 * for the same r/w operation 3144 * for UFS, need to set EBADFD 3145 */ 3146 if ((fp != prev_fp) || (mode != prev_mode)) { 3147 aio_func = check_vp(vp, mode); 3148 if (aio_func == NULL) { 3149 prev_fp = NULL; 3150 releasef(aiocb->aio_fildes); 3151 lio_set_uerror(&cbp->aio_resultp, EBADFD); 3152 aio_notsupported++; 3153 if (head) { 3154 mutex_enter(&aiop->aio_mutex); 3155 head->lio_nent--; 3156 head->lio_refcnt--; 3157 mutex_exit(&aiop->aio_mutex); 3158 } 3159 continue; 3160 } else { 3161 prev_fp = fp; 3162 prev_mode = mode; 3163 } 3164 } 3165 #ifdef _LP64 3166 aiocb_LFton(aiocb, &aiocb_n); 3167 error = aio_req_setup(&reqp, aiop, &aiocb_n, 3168 (aio_result_t *)&cbp->aio_resultp, aio_use_port, vp); 3169 #else 3170 error = aio_req_setupLF(&reqp, aiop, aiocb, 3171 (aio_result_t *)&cbp->aio_resultp, aio_use_port, vp); 3172 #endif /* _LP64 */ 3173 if (error) { 3174 releasef(aiocb->aio_fildes); 3175 if (head) { 3176 mutex_enter(&aiop->aio_mutex); 3177 head->lio_nent--; 3178 head->lio_refcnt--; 3179 mutex_exit(&aiop->aio_mutex); 3180 } 3181 aio_errors++; 3182 continue; 3183 } 3184 3185 reqp->aio_req_lio = head; 3186 deadhead = 0; 3187 3188 /* 3189 * Set the errno field now before sending the request to 3190 * the driver to avoid a race condition 3191 */ 3192 (void) suword32(&cbp->aio_resultp.aio_errno, 3193 EINPROGRESS); 3194 3195 reqp->aio_req_iocb.iocb32 = *ucbp; 3196 3197 if (aio_use_port) { 3198 reqp->aio_req_port = pnotify.portnfy_port; 3199 error = aio_req_assoc_port32(&aiocb->aio_sigevent, 3200 (void *)(uintptr_t)pnotify.portnfy_user, 3201 (aiocb_t *)(uintptr_t)*ucbp, reqp, pkevtp); 3202 } 3203 3204 /* 3205 * send the request to driver. 3206 * Clustering: If PXFS vnode, call PXFS function. 3207 */ 3208 if (error == 0) { 3209 if (aiocb->aio_nbytes == 0) { 3210 clear_active_fd(aiocb->aio_fildes); 3211 aio_zerolen(reqp); 3212 continue; 3213 } 3214 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 3215 CRED()); 3216 } 3217 3218 /* 3219 * the fd's ref count is not decremented until the IO has 3220 * completed unless there was an error. 
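 *
 * (Added note.)  Each entry that is skipped or fails here drops both
 * head->lio_nent and head->lio_refcnt; submitted entries hold their
 * reference until the I/O completes.  A LIO_WAIT caller sleeps on
 * head->lio_notify until lio_refcnt reaches zero.  Failures are only
 * summarized after the loop: ENOTSUP if any request was not
 * supported, otherwise EIO if any request failed.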
3221 */ 3222 if (error) { 3223 releasef(aiocb->aio_fildes); 3224 lio_set_uerror(&cbp->aio_resultp, error); 3225 if (head) { 3226 mutex_enter(&aiop->aio_mutex); 3227 head->lio_nent--; 3228 head->lio_refcnt--; 3229 mutex_exit(&aiop->aio_mutex); 3230 } 3231 if (error == ENOTSUP) 3232 aio_notsupported++; 3233 else 3234 aio_errors++; 3235 lio_set_error(reqp); 3236 } else { 3237 clear_active_fd(aiocb->aio_fildes); 3238 } 3239 } 3240 3241 if (pkevtp) 3242 port_free_event(pkevtp); 3243 3244 if (aio_notsupported) { 3245 error = ENOTSUP; 3246 } else if (aio_errors) { 3247 /* 3248 * return EIO if any request failed 3249 */ 3250 error = EIO; 3251 } 3252 3253 if (mode_arg == LIO_WAIT) { 3254 mutex_enter(&aiop->aio_mutex); 3255 while (head->lio_refcnt > 0) { 3256 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 3257 mutex_exit(&aiop->aio_mutex); 3258 error = EINTR; 3259 goto done; 3260 } 3261 } 3262 mutex_exit(&aiop->aio_mutex); 3263 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE); 3264 } 3265 3266 done: 3267 kmem_free(cbplist, ssize); 3268 if (deadhead) { 3269 if (head->lio_sigqp) 3270 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 3271 kmem_free(head, sizeof (aio_lio_t)); 3272 } 3273 return (error); 3274 } 3275 3276 #ifdef _SYSCALL32_IMPL 3277 static void 3278 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest) 3279 { 3280 dest->aio_fildes = src->aio_fildes; 3281 dest->aio_buf = (void *)(uintptr_t)src->aio_buf; 3282 dest->aio_nbytes = (size_t)src->aio_nbytes; 3283 dest->aio_offset = (off_t)src->aio_offset; 3284 dest->aio_reqprio = src->aio_reqprio; 3285 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify; 3286 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo; 3287 3288 /* 3289 * See comment in sigqueue32() on handling of 32-bit 3290 * sigvals in a 64-bit kernel. 3291 */ 3292 dest->aio_sigevent.sigev_value.sival_int = 3293 (int)src->aio_sigevent.sigev_value.sival_int; 3294 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval)) 3295 (uintptr_t)src->aio_sigevent.sigev_notify_function; 3296 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *) 3297 (uintptr_t)src->aio_sigevent.sigev_notify_attributes; 3298 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2; 3299 dest->aio_lio_opcode = src->aio_lio_opcode; 3300 dest->aio_state = src->aio_state; 3301 dest->aio__pad[0] = src->aio__pad[0]; 3302 } 3303 #endif 3304 3305 /* 3306 * This function is used only for largefile calls made by 3307 * 32 bit applications on 32 bit kernel. 
*/ 3309 static int 3310 aio_req_setupLF( 3311 aio_req_t **reqpp, 3312 aio_t *aiop, 3313 aiocb64_32_t *arg, 3314 aio_result_t *resultp, 3315 int port, 3316 vnode_t *vp) 3317 { 3318 aio_req_t *reqp; 3319 sigqueue_t *sqp; 3320 struct uio *uio; 3321 3322 struct sigevent *sigev; 3323 int error; 3324 3325 sigev = (struct sigevent *)&arg->aio_sigevent; 3326 if ((sigev->sigev_notify == SIGEV_SIGNAL) && 3327 (sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG)) { 3328 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3329 if (sqp == NULL) 3330 return (EAGAIN); 3331 sqp->sq_func = NULL; 3332 sqp->sq_next = NULL; 3333 sqp->sq_info.si_code = SI_ASYNCIO; 3334 sqp->sq_info.si_pid = curproc->p_pid; 3335 sqp->sq_info.si_ctid = PRCTID(curproc); 3336 sqp->sq_info.si_zoneid = getzoneid(); 3337 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3338 sqp->sq_info.si_signo = sigev->sigev_signo; 3339 sqp->sq_info.si_value = sigev->sigev_value; 3340 } else 3341 sqp = NULL; 3342 3343 mutex_enter(&aiop->aio_mutex); 3344 3345 if (aiop->aio_flags & AIO_REQ_BLOCK) { 3346 mutex_exit(&aiop->aio_mutex); 3347 if (sqp) 3348 kmem_free(sqp, sizeof (sigqueue_t)); 3349 return (EIO); 3350 } 3351 /* 3352 * get an aio_reqp from the free list or allocate one 3353 * from dynamic memory. 3354 */ 3355 if (error = aio_req_alloc(&reqp, resultp)) { 3356 mutex_exit(&aiop->aio_mutex); 3357 if (sqp) 3358 kmem_free(sqp, sizeof (sigqueue_t)); 3359 return (error); 3360 } 3361 aiop->aio_pending++; 3362 aiop->aio_outstanding++; 3363 reqp->aio_req_flags = AIO_PENDING; 3364 if (port) 3365 aio_enq_port_pending(aiop, reqp); 3366 mutex_exit(&aiop->aio_mutex); 3367 /* 3368 * initialize aio request. 3369 */ 3370 reqp->aio_req_fd = arg->aio_fildes; 3371 reqp->aio_req_sigqp = sqp; 3372 reqp->aio_req_iocb.iocb = NULL; 3373 reqp->aio_req_buf.b_file = vp; 3374 uio = reqp->aio_req.aio_uio; 3375 uio->uio_iovcnt = 1; 3376 uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf; 3377 uio->uio_iov->iov_len = arg->aio_nbytes; 3378 uio->uio_loffset = arg->aio_offset; 3379 *reqpp = reqp; 3380 return (0); 3381 } 3382 3383 /* 3384 * This routine is called when a non-largefile call is made by a 32bit 3385 * process on an ILP32 or LP64 kernel.
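 *
 * Illustrative only: a user-level list I/O call from a 32 bit process
 * that can land here.  "fd" (opened O_RDWR on a supported raw device)
 * and the buffers are placeholders.
 *
 *	#include <aio.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	struct aiocb cb0, cb1;
 *	struct aiocb *list[2] = { &cb0, &cb1 };
 *
 *	(void) memset(&cb0, 0, sizeof (cb0));
 *	(void) memset(&cb1, 0, sizeof (cb1));
 *	cb0.aio_fildes = fd;
 *	cb0.aio_buf = buf0;
 *	cb0.aio_nbytes = 512;
 *	cb0.aio_offset = 0;
 *	cb0.aio_lio_opcode = LIO_READ;
 *	cb1.aio_fildes = fd;
 *	cb1.aio_buf = buf1;
 *	cb1.aio_nbytes = 512;
 *	cb1.aio_offset = 512;
 *	cb1.aio_lio_opcode = LIO_WRITE;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) != 0)
 *		perror("lio_listio");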
3386 */ 3387 static int 3388 alio32( 3389 int mode_arg, 3390 void *aiocb_arg, 3391 int nent, 3392 void *sigev_arg) 3393 { 3394 file_t *fp; 3395 file_t *prev_fp = NULL; 3396 int prev_mode = -1; 3397 struct vnode *vp; 3398 aio_lio_t *head; 3399 aio_req_t *reqp; 3400 aio_t *aiop; 3401 aiocb_t cb; 3402 aiocb_t *aiocb = &cb; 3403 caddr_t cbplist; 3404 #ifdef _LP64 3405 aiocb32_t *cbp; 3406 caddr32_t *ucbp; 3407 aiocb32_t cb32; 3408 aiocb32_t *aiocb32 = &cb32; 3409 struct sigevent32 sigev; 3410 #else 3411 aiocb_t *cbp, **ucbp; 3412 struct sigevent sigev; 3413 #endif 3414 sigqueue_t *sqp; 3415 int (*aio_func)(); 3416 int mode; 3417 int error = 0, aio_errors = 0; 3418 int i; 3419 size_t ssize; 3420 int deadhead = 0; 3421 int aio_notsupported = 0; 3422 int aio_use_port = 0; 3423 port_kevent_t *pkevtp = NULL; 3424 #ifdef _LP64 3425 port_notify32_t pnotify; 3426 #else 3427 port_notify_t pnotify; 3428 #endif 3429 aiop = curproc->p_aio; 3430 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 3431 return (EINVAL); 3432 3433 #ifdef _LP64 3434 ssize = (sizeof (caddr32_t) * nent); 3435 #else 3436 ssize = (sizeof (aiocb_t *) * nent); 3437 #endif 3438 cbplist = kmem_alloc(ssize, KM_SLEEP); 3439 ucbp = (void *)cbplist; 3440 3441 if (copyin(aiocb_arg, cbplist, ssize)) { 3442 kmem_free(cbplist, ssize); 3443 return (EFAULT); 3444 } 3445 3446 if (sigev_arg) { 3447 if (copyin(sigev_arg, &sigev, sizeof (struct sigevent32))) { 3448 kmem_free(cbplist, ssize); 3449 return (EFAULT); 3450 } 3451 } 3452 3453 /* 3454 * a list head should be allocated if notification is 3455 * enabled for this list. 3456 */ 3457 head = NULL; 3458 3459 /* Event Ports */ 3460 3461 if (sigev_arg && sigev.sigev_notify == SIGEV_PORT) { 3462 /* Use PORT for completion notification */ 3463 if (copyin((void *)(uintptr_t)sigev.sigev_value.sival_ptr, 3464 &pnotify, sizeof (port_notify32_t))) { 3465 kmem_free(cbplist, ssize); 3466 return (EFAULT); 3467 } 3468 /* use event ports for the list of aiocbs */ 3469 aio_use_port = 1; 3470 error = port_alloc_event(pnotify.portnfy_port, 3471 PORT_ALLOC_PRIVATE, PORT_SOURCE_AIO, &pkevtp); 3472 if (error) { 3473 if ((error == ENOMEM) || (error == EAGAIN)) 3474 error = EAGAIN; 3475 else 3476 error = EINVAL; 3477 kmem_free(cbplist, ssize); 3478 return (error); 3479 } 3480 } else if ((mode_arg == LIO_WAIT) || sigev_arg) { 3481 mutex_enter(&aiop->aio_mutex); 3482 error = aio_lio_alloc(&head); 3483 mutex_exit(&aiop->aio_mutex); 3484 if (error) 3485 goto done; 3486 deadhead = 1; 3487 head->lio_nent = nent; 3488 head->lio_refcnt = nent; 3489 if (sigev_arg && (sigev.sigev_notify == SIGEV_SIGNAL) && 3490 (sigev.sigev_signo > 0 && sigev.sigev_signo < NSIG)) { 3491 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3492 if (sqp == NULL) { 3493 error = EAGAIN; 3494 goto done; 3495 } 3496 sqp->sq_func = NULL; 3497 sqp->sq_next = NULL; 3498 sqp->sq_info.si_code = SI_ASYNCIO; 3499 sqp->sq_info.si_pid = curproc->p_pid; 3500 sqp->sq_info.si_ctid = PRCTID(curproc); 3501 sqp->sq_info.si_zoneid = getzoneid(); 3502 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3503 sqp->sq_info.si_signo = sigev.sigev_signo; 3504 sqp->sq_info.si_value.sival_int = 3505 sigev.sigev_value.sival_int; 3506 head->lio_sigqp = sqp; 3507 } else { 3508 head->lio_sigqp = NULL; 3509 } 3510 } 3511 3512 for (i = 0; i < nent; i++, ucbp++) { 3513 3514 /* skip entry if it can't be copied. 
*/ 3515 #ifdef _LP64 3516 cbp = (aiocb32_t *)(uintptr_t)*ucbp; 3517 if (cbp == NULL || copyin(cbp, aiocb32, sizeof (aiocb32_t))) { 3518 #else 3519 cbp = (aiocb_t *)*ucbp; 3520 if (cbp == NULL || copyin(cbp, aiocb, sizeof (aiocb_t))) { 3521 #endif 3522 if (head) { 3523 mutex_enter(&aiop->aio_mutex); 3524 head->lio_nent--; 3525 head->lio_refcnt--; 3526 mutex_exit(&aiop->aio_mutex); 3527 } 3528 continue; 3529 } 3530 #ifdef _LP64 3531 /* 3532 * copy 32 bit structure into 64 bit structure 3533 */ 3534 aiocb_32ton(aiocb32, aiocb); 3535 #endif /* _LP64 */ 3536 3537 /* skip if opcode for aiocb is LIO_NOP */ 3538 3539 mode = aiocb->aio_lio_opcode; 3540 if (mode == LIO_NOP) { 3541 cbp = NULL; 3542 if (head) { 3543 mutex_enter(&aiop->aio_mutex); 3544 head->lio_nent--; 3545 head->lio_refcnt--; 3546 mutex_exit(&aiop->aio_mutex); 3547 } 3548 continue; 3549 } 3550 3551 /* increment file descriptor's ref count. */ 3552 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 3553 lio_set_uerror(&cbp->aio_resultp, EBADF); 3554 if (head) { 3555 mutex_enter(&aiop->aio_mutex); 3556 head->lio_nent--; 3557 head->lio_refcnt--; 3558 mutex_exit(&aiop->aio_mutex); 3559 } 3560 aio_errors++; 3561 continue; 3562 } 3563 3564 vp = fp->f_vnode; 3565 3566 /* 3567 * check the permission of the partition 3568 */ 3569 mode = aiocb->aio_lio_opcode; 3570 if ((fp->f_flag & mode) == 0) { 3571 releasef(aiocb->aio_fildes); 3572 lio_set_uerror(&cbp->aio_resultp, EBADF); 3573 if (head) { 3574 mutex_enter(&aiop->aio_mutex); 3575 head->lio_nent--; 3576 head->lio_refcnt--; 3577 mutex_exit(&aiop->aio_mutex); 3578 } 3579 aio_errors++; 3580 continue; 3581 } 3582 3583 /* 3584 * common case where requests are to the same fd 3585 * for the same r/w operation 3586 * for UFS, need to set EBADFD 3587 */ 3588 if ((fp != prev_fp) || (mode != prev_mode)) { 3589 aio_func = check_vp(vp, mode); 3590 if (aio_func == NULL) { 3591 prev_fp = NULL; 3592 releasef(aiocb->aio_fildes); 3593 lio_set_uerror(&cbp->aio_resultp, 3594 EBADFD); 3595 aio_notsupported++; 3596 if (head) { 3597 mutex_enter(&aiop->aio_mutex); 3598 head->lio_nent--; 3599 head->lio_refcnt--; 3600 mutex_exit(&aiop->aio_mutex); 3601 } 3602 continue; 3603 } else { 3604 prev_fp = fp; 3605 prev_mode = mode; 3606 } 3607 } 3608 if (error = aio_req_setup(&reqp, aiop, aiocb, 3609 (aio_result_t *)&cbp->aio_resultp, aio_use_port, vp)) { 3610 releasef(aiocb->aio_fildes); 3611 lio_set_uerror(&cbp->aio_resultp, error); 3612 if (head) { 3613 mutex_enter(&aiop->aio_mutex); 3614 head->lio_nent--; 3615 head->lio_refcnt--; 3616 mutex_exit(&aiop->aio_mutex); 3617 } 3618 aio_errors++; 3619 continue; 3620 } 3621 3622 reqp->aio_req_lio = head; 3623 deadhead = 0; 3624 3625 /* 3626 * Set the errno field now before sending the request to 3627 * the driver to avoid a race condition 3628 */ 3629 (void) suword32(&cbp->aio_resultp.aio_errno, 3630 EINPROGRESS); 3631 3632 reqp->aio_req_iocb.iocb32 = ((caddr32_t *)cbplist)[i]; 3633 3634 if (aio_use_port) { 3635 reqp->aio_req_port = pnotify.portnfy_port; 3636 #ifdef _LP64 3637 error = aio_req_assoc_port32(&aiocb32->aio_sigevent, 3638 (void *)(uintptr_t)pnotify.portnfy_user, 3639 (aiocb_t *)(uintptr_t)(((caddr32_t *)cbplist)[i]), 3640 reqp, pkevtp); 3641 #else 3642 error = aio_req_assoc_port(&aiocb->aio_sigevent, 3643 pnotify.portnfy_user, 3644 (aiocb_t *)(((caddr32_t *)cbplist)[i]), 3645 reqp, pkevtp); 3646 #endif 3647 } 3648 3649 /* 3650 * send the request to driver. 3651 * Clustering: If PXFS vnode, call PXFS function. 
*/ 3653 if (error == 0) { 3654 if (aiocb->aio_nbytes == 0) { 3655 clear_active_fd(aiocb->aio_fildes); 3656 aio_zerolen(reqp); 3657 continue; 3658 } 3659 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 3660 CRED()); 3661 } 3662 3663 /* 3664 * the fd's ref count is not decremented until the IO has 3665 * completed unless there was an error. 3666 */ 3667 if (error) { 3668 releasef(aiocb->aio_fildes); 3669 lio_set_uerror(&cbp->aio_resultp, error); 3670 if (head) { 3671 mutex_enter(&aiop->aio_mutex); 3672 head->lio_nent--; 3673 head->lio_refcnt--; 3674 mutex_exit(&aiop->aio_mutex); 3675 } 3676 if (error == ENOTSUP) 3677 aio_notsupported++; 3678 else 3679 aio_errors++; 3680 lio_set_error(reqp); 3681 } else { 3682 clear_active_fd(aiocb->aio_fildes); 3683 } 3684 } 3685 3686 if (pkevtp) 3687 port_free_event(pkevtp); 3688 3689 if (aio_notsupported) { 3690 error = ENOTSUP; 3691 } else if (aio_errors) { 3692 /* 3693 * return EIO if any request failed 3694 */ 3695 error = EIO; 3696 } 3697 3698 if (mode_arg == LIO_WAIT) { 3699 mutex_enter(&aiop->aio_mutex); 3700 while (head->lio_refcnt > 0) { 3701 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 3702 mutex_exit(&aiop->aio_mutex); 3703 error = EINTR; 3704 goto done; 3705 } 3706 } 3707 mutex_exit(&aiop->aio_mutex); 3708 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32); 3709 } 3710 3711 done: 3712 kmem_free(cbplist, ssize); 3713 if (deadhead) { 3714 if (head->lio_sigqp) 3715 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 3716 kmem_free(head, sizeof (aio_lio_t)); 3717 } 3718 return (error); 3719 } 3720 3721 3722 #ifdef _SYSCALL32_IMPL 3723 void 3724 aiocb_32ton(aiocb32_t *src, aiocb_t *dest) 3725 { 3726 dest->aio_fildes = src->aio_fildes; 3727 dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf; 3728 dest->aio_nbytes = (size_t)src->aio_nbytes; 3729 dest->aio_offset = (off_t)src->aio_offset; 3730 dest->aio_reqprio = src->aio_reqprio; 3731 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify; 3732 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo; 3733 3734 /* 3735 * See comment in sigqueue32() on handling of 32-bit 3736 * sigvals in a 64-bit kernel. 3737 */ 3738 dest->aio_sigevent.sigev_value.sival_int = 3739 (int)src->aio_sigevent.sigev_value.sival_int; 3740 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval)) 3741 (uintptr_t)src->aio_sigevent.sigev_notify_function; 3742 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *) 3743 (uintptr_t)src->aio_sigevent.sigev_notify_attributes; 3744 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2; 3745 dest->aio_lio_opcode = src->aio_lio_opcode; 3746 dest->aio_state = src->aio_state; 3747 dest->aio__pad[0] = src->aio__pad[0]; 3748 } 3749 #endif /* _SYSCALL32_IMPL */ 3750 3751 /* 3752 * aio_port_callback() is called just before the event is retrieved from the 3753 * port. The task of this callback function is to finish the work of the 3754 * transaction for the application, which means: 3755 * - copyout transaction data to the application 3756 * (this thread is running in the right process context) 3757 * - keep track of the transaction (update counters). 3758 * - free allocated buffers 3759 * The aiocb pointer is the object element of the port_kevent_t structure.
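 *
 * Illustrative only (user-level view; "fd" and "buf" are placeholders,
 * and the event layout described is the normal event-port contract
 * rather than anything specific to this file):
 *
 *	#include <port.h>
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	int port = port_create();
 *	port_notify_t pn;
 *	port_event_t pe;
 *	struct aiocb cb;
 *
 *	(void) memset(&cb, 0, sizeof (cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof (buf);
 *	pn.portnfy_port = port;
 *	pn.portnfy_user = &cb;
 *	cb.aio_sigevent.sigev_notify = SIGEV_PORT;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &pn;
 *	if (aio_read(&cb) == 0)
 *		(void) port_get(port, &pe, NULL);
 *
 * On return pe.portev_source is PORT_SOURCE_AIO, pe.portev_object is
 * the address of the aiocb ("cb" above), and pe.portev_user is the
 * portnfy_user cookie supplied in the port_notify_t.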
3760 * 3761 * flag : 3762 * PORT_CALLBACK_DEFAULT : do copyout and free resources 3763 * PORT_CALLBACK_CLOSE : don't do copyout, free resources 3764 */ 3765 3766 /*ARGSUSED*/ 3767 int 3768 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp) 3769 { 3770 aio_t *aiop = curproc->p_aio; 3771 aio_req_t *reqp = arg; 3772 struct iovec *iov; 3773 struct buf *bp; 3774 void *resultp; 3775 3776 if (pid != curproc->p_pid) { 3777 /* wrong proc !!, can not deliver data here ... */ 3778 return (EACCES); 3779 } 3780 3781 mutex_enter(&aiop->aio_portq_mutex); 3782 reqp->aio_req_portkev = NULL; 3783 aio_req_remove_portq(aiop, reqp); /* remove request from portq */ 3784 mutex_exit(&aiop->aio_portq_mutex); 3785 aphysio_unlock(reqp); /* unlock used pages */ 3786 mutex_enter(&aiop->aio_mutex); 3787 if (reqp->aio_req_flags & AIO_COPYOUTDONE) { 3788 aio_req_free_port(aiop, reqp); /* back to free list */ 3789 mutex_exit(&aiop->aio_mutex); 3790 return (0); 3791 } 3792 3793 iov = reqp->aio_req_uio.uio_iov; 3794 bp = &reqp->aio_req_buf; 3795 resultp = (void *)reqp->aio_req_resultp; 3796 aio_req_free_port(aiop, reqp); /* request struct back to free list */ 3797 mutex_exit(&aiop->aio_mutex); 3798 if (flag == PORT_CALLBACK_DEFAULT) 3799 aio_copyout_result_port(iov, bp, resultp); 3800 return (0); 3801 } 3802