/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2018, Joyent, Inc.
 */

/*
 * Kernel asynchronous I/O.
 * This is only for raw devices now (as of Nov. 1993).
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/fs/snode.h>
#include <sys/unistd.h>
#include <sys/cmn_err.h>
#include <vm/as.h>
#include <vm/faultcode.h>
#include <sys/sysmacros.h>
#include <sys/procfs.h>
#include <sys/kmem.h>
#include <sys/autoconf.h>
#include <sys/ddi_impldefs.h>
#include <sys/sunddi.h>
#include <sys/aio_impl.h>
#include <sys/debug.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/fs/pxfs_ki.h>
#include <sys/contract/process_impl.h>

/*
 * external entry point.
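 * kaioc() is the native 64-bit entry point; kaio() handles the
 * 32-bit and large-file variants of the same calls.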
 */
#ifdef _LP64
static int64_t kaioc(long, long, long, long, long, long);
#endif
static int kaio(ulong_t *, rval_t *);


#define	AIO_64		0
#define	AIO_32		1
#define	AIO_LARGEFILE	2

/*
 * implementation specific functions (private)
 */
#ifdef _LP64
static int alio(int, aiocb_t **, int, struct sigevent *);
#endif
static int aionotify(void);
static int aioinit(void);
static int aiostart(void);
static void alio_cleanup(aio_t *, aiocb_t **, int, int);
static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
    cred_t *);
static void lio_set_error(aio_req_t *, int portused);
static aio_t *aio_aiop_alloc();
static int aio_req_alloc(aio_req_t **, aio_result_t *);
static int aio_lio_alloc(aio_lio_t **);
static aio_req_t *aio_req_done(void *);
static aio_req_t *aio_req_remove(aio_req_t *);
static int aio_req_find(aio_result_t *, aio_req_t **);
static int aio_hash_insert(struct aio_req_t *, aio_t *);
static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
    aio_result_t *, vnode_t *, int);
static int aio_cleanup_thread(aio_t *);
static aio_lio_t *aio_list_get(aio_result_t *);
static void lio_set_uerror(void *, int);
extern void aio_zerolen(aio_req_t *);
static int aiowait(struct timeval *, int, long *);
static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
    aio_req_t *reqlist, aio_t *aiop, model_t model);
static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
static int aiosuspend(void *, int, struct timespec *, int,
    long *, int);
static int aliowait(int, void *, int, void *, int);
static int aioerror(void *, int);
static int aio_cancel(int, void *, long *, int);
static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
static int aiorw(int, void *, int, int);

static int alioLF(int, void *, int, void *);
static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
    aio_result_t *, vnode_t *, int);
static int alio32(int, void *, int, void *);
static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);

#ifdef	_SYSCALL32_IMPL
static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
void aiocb_32ton(aiocb32_t *, aiocb_t *);
#endif /* _SYSCALL32_IMPL */

/*
 * implementation specific functions (external)
 */
void aio_req_free(aio_t *, aio_req_t *);

/*
 * Event Port framework
 */

void aio_req_free_port(aio_t *, aio_req_t *);
static int aio_port_callback(void *, int *, pid_t, int, void *);

/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>
#include <sys/syscall.h>

#ifdef _LP64

static struct sysent kaio_sysent = {
	6,
	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
	(int (*)())kaioc
};

#ifdef _SYSCALL32_IMPL
static struct sysent kaio_sysent32 = {
	7,
	SE_NOUNLOAD | SE_64RVAL,
	kaio
};
#endif  /* _SYSCALL32_IMPL */

#else   /* _LP64 */

static struct sysent kaio_sysent = {
	7,
	SE_NOUNLOAD | SE_32RVAL1,
	kaio
};

#endif  /* _LP64 */

/*
 * Module linkage information for the kernel.
 */

static struct modlsys modlsys = {
	&mod_syscallops,
	"kernel Async I/O",
	&kaio_sysent
};

#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"kernel Async I/O for 32 bit compatibility",
	&kaio_sysent32
};
#endif  /* _SYSCALL32_IMPL */


static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};

int
_init(void)
{
	int retval;

	if ((retval = mod_install(&modlinkage)) != 0)
		return (retval);

	return (0);
}

int
_fini(void)
{
	int retval;

	retval = mod_remove(&modlinkage);

	return (retval);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

#ifdef _LP64
static int64_t
kaioc(
	long	a0,
	long	a1,
	long	a2,
	long	a3,
	long	a4,
	long	a5)
{
	int	error;
	long	rval = 0;

	switch ((int)a0 & ~AIO_POLL_BIT) {
	case AIOREAD:
		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
		    (offset_t)a4, (aio_result_t *)a5, FREAD);
		break;
	case AIOWRITE:
		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
		break;
	case AIOWAIT:
		error = aiowait((struct timeval *)a1, (int)a2, &rval);
		break;
	case AIOWAITN:
		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
		    (timespec_t *)a4);
		break;
	case AIONOTIFY:
		error = aionotify();
		break;
	case AIOINIT:
		error = aioinit();
		break;
	case AIOSTART:
		error = aiostart();
		break;
	case AIOLIO:
		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
		    (struct sigevent *)a4);
		break;
	case AIOLIOWAIT:
		error = aliowait((int)a1, (void *)a2, (int)a3,
		    (struct sigevent *)a4, AIO_64);
		break;
	case AIOSUSPEND:
		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
		    (int)a4, &rval, AIO_64);
		break;
	case AIOERROR:
		error = aioerror((void *)a1, AIO_64);
		break;
	case AIOAREAD:
		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
		break;
	case AIOAWRITE:
		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
		break;
	case AIOCANCEL:
		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
		break;

	/*
	 * The large file related stuff is valid only for
	 * 32 bit kernel and not for 64 bit kernel.
	 * On 64 bit kernel we convert large file calls
	 * to regular 64bit calls.
	 */

	default:
		error = EINVAL;
	}
	if (error)
		return ((int64_t)set_errno(error));
	return (rval);
}
#endif

static int
kaio(
	ulong_t *uap,
	rval_t *rvp)
{
	long	rval = 0;
	int	error = 0;
	offset_t	off;


	rvp->r_vals = 0;
#if defined(_LITTLE_ENDIAN)
	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
#else
	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
#endif

	switch (uap[0] & ~AIO_POLL_BIT) {
	/*
	 * It must be the 32 bit system call on 64 bit kernel
	 */
	case AIOREAD:
		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
	case AIOWRITE:
		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
	case AIOWAIT:
		error = aiowait((struct timeval *)uap[1], (int)uap[2],
		    &rval);
		break;
	case AIOWAITN:
		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
		    (uint_t *)uap[3], (timespec_t *)uap[4]);
		break;
	case AIONOTIFY:
		return (aionotify());
	case AIOINIT:
		return (aioinit());
	case AIOSTART:
		return (aiostart());
	case AIOLIO:
		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
		    (void *)uap[4]));
	case AIOLIOWAIT:
		return (aliowait((int)uap[1], (void *)uap[2],
		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
	case AIOSUSPEND:
		error = aiosuspend((void *)uap[1], (int)uap[2],
		    (timespec_t *)uap[3], (int)uap[4],
		    &rval, AIO_32);
		break;
	case AIOERROR:
		return (aioerror((void *)uap[1], AIO_32));
	case AIOAREAD:
		return (aiorw((int)uap[0], (void *)uap[1],
		    FREAD, AIO_32));
	case AIOAWRITE:
		return (aiorw((int)uap[0], (void *)uap[1],
		    FWRITE, AIO_32));
	case AIOCANCEL:
		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
		    AIO_32));
		break;
	case AIOLIO64:
		return (alioLF((int)uap[1], (void *)uap[2],
		    (int)uap[3], (void *)uap[4]));
	case AIOLIOWAIT64:
		return (aliowait(uap[1], (void *)uap[2],
		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
	case AIOSUSPEND64:
		error = aiosuspend((void *)uap[1], (int)uap[2],
		    (timespec_t *)uap[3], (int)uap[4], &rval,
		    AIO_LARGEFILE);
		break;
	case AIOERROR64:
		return (aioerror((void *)uap[1], AIO_LARGEFILE));
	case AIOAREAD64:
		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
		    AIO_LARGEFILE));
	case AIOAWRITE64:
		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
		    AIO_LARGEFILE));
	case AIOCANCEL64:
		error = (aio_cancel((int)uap[1], (void *)uap[2],
		    &rval, AIO_LARGEFILE));
		break;
	default:
		return (EINVAL);
	}

	rvp->r_val1 = rval;
	return (error);
}

/*
 * wake up LWPs in this process that are sleeping in
 * aiowait().
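 * The notify count lets a later aiowait() return even when the
 * completed request was queued on a user-level done queue.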
 */
static int
aionotify(void)
{
	aio_t	*aiop;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (0);

	mutex_enter(&aiop->aio_mutex);
	aiop->aio_notifycnt++;
	cv_broadcast(&aiop->aio_waitcv);
	mutex_exit(&aiop->aio_mutex);

	return (0);
}

static int
timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
    timestruc_t **rqtp, int *blocking)
{
#ifdef	_SYSCALL32_IMPL
	struct timeval32 wait_time_32;
#endif
	struct timeval wait_time;
	model_t	model = get_udatamodel();

	*rqtp = NULL;
	if (timout == NULL) {		/* wait indefinitely */
		*blocking = 1;
		return (0);
	}

	/*
	 * Need to correctly compare with the -1 passed in for a user
	 * address pointer, with both 32 bit and 64 bit apps.
	 */
	if (model == DATAMODEL_NATIVE) {
		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
			*blocking = 0;
			return (0);
		}

		if (copyin(timout, &wait_time, sizeof (wait_time)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32_IMPL
	else {
		/*
		 * -1 from a 32bit app. It will not get sign extended.
		 * don't wait if -1.
		 */
		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
			*blocking = 0;
			return (0);
		}

		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
			return (EFAULT);
		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
	}
#endif  /* _SYSCALL32_IMPL */

	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
		*blocking = 0;
		return (0);
	}

	if (wait_time.tv_sec < 0 ||
	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
		return (EINVAL);

	rqtime->tv_sec = wait_time.tv_sec;
	rqtime->tv_nsec = wait_time.tv_usec * 1000;
	*rqtp = rqtime;
	*blocking = 1;

	return (0);
}

static int
timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
    timestruc_t **rqtp, int *blocking)
{
#ifdef	_SYSCALL32_IMPL
	timespec32_t wait_time_32;
#endif
	model_t	model = get_udatamodel();

	*rqtp = NULL;
	if (timout == NULL) {
		*blocking = 1;
		return (0);
	}

	if (model == DATAMODEL_NATIVE) {
		if (copyin(timout, rqtime, sizeof (*rqtime)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32_IMPL
	else {
		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
			return (EFAULT);
		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
	}
#endif  /* _SYSCALL32_IMPL */

	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
		*blocking = 0;
		return (0);
	}

	if (rqtime->tv_sec < 0 ||
	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
		return (EINVAL);

	*rqtp = rqtime;
	*blocking = 1;

	return (0);
}

/*ARGSUSED*/
static int
aiowait(
	struct timeval	*timout,
	int	dontblockflg,
	long	*rval)
{
	int		error;
	aio_t		*aiop;
	aio_req_t	*reqp;
	clock_t		status;
	int		blocking;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	/*
	 * Establish the absolute future time for the timeout.
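	 * cv_waituntil_sig() takes an absolute timestruc_t, so the
	 * relative value from timeval2reltime() is added to the
	 * current time below.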
	 */
	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
	if (error)
		return (error);
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	mutex_enter(&aiop->aio_mutex);
	for (;;) {
		/* process requests on poll queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_mutex);
		}
		if ((reqp = aio_req_remove(NULL)) != NULL) {
			*rval = (long)reqp->aio_req_resultp;
			break;
		}
		/* user-level done queue might not be empty */
		if (aiop->aio_notifycnt > 0) {
			aiop->aio_notifycnt--;
			*rval = 1;
			break;
		}
		/* don't block if no outstanding aio */
		if (aiop->aio_outstanding == 0 && dontblockflg) {
			error = EINVAL;
			break;
		}
		if (blocking) {
			status = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);

			if (status > 0)		/* check done queue again */
				continue;
			if (status == 0) {	/* interrupted by a signal */
				error = EINTR;
				*rval = -1;
			} else {		/* timer expired */
				error = ETIME;
			}
		}
		break;
	}
	mutex_exit(&aiop->aio_mutex);
	if (reqp) {
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}
	return (error);
}

/*
 * aiowaitn can be used to reap completed asynchronous requests submitted with
 * lio_listio, aio_read or aio_write.
 * This function only reaps asynchronous raw I/Os.
 */

/*ARGSUSED*/
static int
aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
{
	int		error = 0;
	aio_t		*aiop;
	aio_req_t	*reqlist = NULL;
	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
	size_t		iocbsz;			/* user's iocb size */
	size_t		riocbsz;		/* returned iocb size */
	int		iocb_index = 0;
	model_t		model = get_udatamodel();
	int		blocking = 1;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	if (aiop->aio_outstanding == 0)
		return (EAGAIN);

	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
		return (EFAULT);

	/* set *nwait to zero, if we must return prematurely */
	if (copyout(&cnt, nwait, sizeof (uint_t)))
		return (EFAULT);

	if (waitcnt == 0) {
		blocking = 0;
		rqtp = NULL;
		waitcnt = nent;
	} else {
		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
		if (error)
			return (error);
	}

	if (model == DATAMODEL_NATIVE)
		iocbsz = (sizeof (aiocb_t *) * nent);
#ifdef	_SYSCALL32_IMPL
	else
		iocbsz = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

	/*
	 * Only one aio_waitn call is allowed at a time.
	 * The active aio_waitn will collect all requests
	 * out of the "done" list and if necessary it will wait
	 * for some/all pending requests to fulfill the nwait
	 * parameter.
	 * A second or further aio_waitn calls will sleep here
	 * until the active aio_waitn finishes and leaves the kernel.
	 * If the second call does not block (poll), then return
	 * immediately with the error code EAGAIN.
	 * If the second call should block, then sleep here, but
	 * do not touch the timeout. The timeout starts when this
	 * aio_waitn-call becomes active.
	 */

	mutex_enter(&aiop->aio_mutex);

	while (aiop->aio_flags & AIO_WAITN) {
		if (blocking == 0) {
			mutex_exit(&aiop->aio_mutex);
			return (EAGAIN);
		}

		/* block, no timeout */
		aiop->aio_flags |= AIO_WAITN_PENDING;
		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
			mutex_exit(&aiop->aio_mutex);
			return (EINTR);
		}
	}

	/*
	 * Establish the absolute future time for the timeout.
	 */
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
		aiop->aio_iocb = NULL;
	}

	if (aiop->aio_iocb == NULL) {
		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
		if (iocblist == NULL) {
			mutex_exit(&aiop->aio_mutex);
			return (ENOMEM);
		}
		aiop->aio_iocb = (aiocb_t **)iocblist;
		aiop->aio_iocbsz = iocbsz;
	} else {
		iocblist = (char *)aiop->aio_iocb;
	}

	aiop->aio_waitncnt = waitcnt;
	aiop->aio_flags |= AIO_WAITN;

	for (;;) {
		/* push requests on poll queue to done queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_mutex);
		}

		/* check for requests on done queue */
		if (aiop->aio_doneq) {
			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
			aiop->aio_waitncnt = waitcnt - cnt;
		}

		/* user-level done queue might not be empty */
		if (aiop->aio_notifycnt > 0) {
			aiop->aio_notifycnt--;
			error = 0;
			break;
		}

		/*
		 * if we are here second time as a result of timer
		 * expiration, we reset error if there are enough
		 * aiocb's to satisfy request.
		 * We return also if all requests are already done
		 * and we picked up the whole done queue.
		 */

		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
		    aiop->aio_doneq == NULL)) {
			error = 0;
			break;
		}

		if ((cnt < waitcnt) && blocking) {
			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);
			if (rval > 0)
				continue;
			if (rval < 0) {
				error = ETIME;
				blocking = 0;
				continue;
			}
			error = EINTR;
		}
		break;
	}

	mutex_exit(&aiop->aio_mutex);

	if (cnt > 0) {

		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
		    aiop, model);

		if (model == DATAMODEL_NATIVE)
			riocbsz = (sizeof (aiocb_t *) * cnt);
#ifdef	_SYSCALL32_IMPL
		else
			riocbsz = (sizeof (caddr32_t) * cnt);
#endif  /* _SYSCALL32_IMPL */

		if (copyout(iocblist, uiocb, riocbsz) ||
		    copyout(&cnt, nwait, sizeof (uint_t)))
			error = EFAULT;
	}

	/* check if there is another thread waiting for execution */
	mutex_enter(&aiop->aio_mutex);
	aiop->aio_flags &= ~AIO_WAITN;
	if (aiop->aio_flags & AIO_WAITN_PENDING) {
		aiop->aio_flags &= ~AIO_WAITN_PENDING;
		cv_signal(&aiop->aio_waitncv);
	}
	mutex_exit(&aiop->aio_mutex);

	return (error);
}

/*
 * aio_unlock_requests
 * copies out the result of the request as well as the return value.
 * It builds the list of completed asynchronous requests,
 * unlocks the allocated memory ranges and
 * puts the aio request structure back into the free list.
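 * Returns the updated index into the iocb list.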
 */

static int
aio_unlock_requests(
	caddr_t	iocblist,
	int	iocb_index,
	aio_req_t	*reqlist,
	aio_t	*aiop,
	model_t	model)
{
	aio_req_t	*reqp, *nreqp;

	if (model == DATAMODEL_NATIVE) {
		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
			(((caddr_t *)iocblist)[iocb_index++]) =
			    reqp->aio_req_iocb.iocb;
			nreqp = reqp->aio_req_next;
			aphysio_unlock(reqp);
			aio_copyout_result(reqp);
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		}
	}
#ifdef	_SYSCALL32_IMPL
	else {
		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
			((caddr32_t *)iocblist)[iocb_index++] =
			    reqp->aio_req_iocb.iocb32;
			nreqp = reqp->aio_req_next;
			aphysio_unlock(reqp);
			aio_copyout_result(reqp);
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		}
	}
#endif	/* _SYSCALL32_IMPL */
	return (iocb_index);
}

/*
 * aio_reqlist_concat
 * moves "max" elements from the done queue to the reqlist queue and removes
 * the AIO_DONEQ flag.
 * - reqlist queue is a simple linked list
 * - done queue is a double linked list
 */

static int
aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
{
	aio_req_t *q2, *q2work, *list;
	int count = 0;

	list = *reqlist;
	q2 = aiop->aio_doneq;
	q2work = q2;
	while (max-- > 0) {
		q2work->aio_req_flags &= ~AIO_DONEQ;
		q2work = q2work->aio_req_next;
		count++;
		if (q2work == q2)
			break;
	}

	if (q2work == q2) {
		/* all elements revised */
		q2->aio_req_prev->aio_req_next = list;
		list = q2;
		aiop->aio_doneq = NULL;
	} else {
		/*
		 * max < elements in the doneq
		 * detach only the required amount of elements
		 * out of the doneq
		 */
		q2work->aio_req_prev->aio_req_next = list;
		list = q2;

		aiop->aio_doneq = q2work;
		q2work->aio_req_prev = q2->aio_req_prev;
		q2->aio_req_prev->aio_req_next = q2work;
	}
	*reqlist = list;
	return (count);
}

/*ARGSUSED*/
static int
aiosuspend(
	void	*aiocb,
	int	nent,
	struct	timespec	*timout,
	int	flag,
	long	*rval,
	int	run_mode)
{
	int		error;
	aio_t		*aiop;
	aio_req_t	*reqp, *found, *next;
	caddr_t		cbplist = NULL;
	aiocb_t		*cbp, **ucbp;
#ifdef	_SYSCALL32_IMPL
	aiocb32_t	*cbp32;
	caddr32_t	*ucbp32;
#endif  /* _SYSCALL32_IMPL */
	aiocb64_32_t	*cbp64;
	int		rv;
	int		i;
	size_t		ssize;
	model_t		model = get_udatamodel();
	int		blocking;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	/*
	 * Establish the absolute future time for the timeout.
	 */
	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
	if (error)
		return (error);
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	/*
	 * If we are not blocking and there's no IO complete
	 * skip aiocb copyin.
	 */
	if (!blocking && (aiop->aio_pollq == NULL) &&
	    (aiop->aio_doneq == NULL)) {
		return (EAGAIN);
	}

	if (model == DATAMODEL_NATIVE)
		ssize = (sizeof (aiocb_t *) * nent);
#ifdef	_SYSCALL32_IMPL
	else
		ssize = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
	if (cbplist == NULL)
		return (ENOMEM);

	if (copyin(aiocb, cbplist, ssize)) {
		error = EFAULT;
		goto done;
	}

	found = NULL;
	/*
	 * we need to get the aio_cleanupq_mutex since we call
	 * aio_req_done().
	 */
	mutex_enter(&aiop->aio_cleanupq_mutex);
	mutex_enter(&aiop->aio_mutex);
	for (;;) {
		/* push requests on poll queue to done queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_cleanupq_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_cleanupq_mutex);
			mutex_enter(&aiop->aio_mutex);
		}
		/* check for requests on done queue */
		if (aiop->aio_doneq) {
			if (model == DATAMODEL_NATIVE)
				ucbp = (aiocb_t **)cbplist;
#ifdef	_SYSCALL32_IMPL
			else
				ucbp32 = (caddr32_t *)cbplist;
#endif  /* _SYSCALL32_IMPL */
			for (i = 0; i < nent; i++) {
				if (model == DATAMODEL_NATIVE) {
					if ((cbp = *ucbp++) == NULL)
						continue;
					if (run_mode != AIO_LARGEFILE)
						reqp = aio_req_done(
						    &cbp->aio_resultp);
					else {
						cbp64 = (aiocb64_32_t *)cbp;
						reqp = aio_req_done(
						    &cbp64->aio_resultp);
					}
				}
#ifdef	_SYSCALL32_IMPL
				else {
					if (run_mode == AIO_32) {
						if ((cbp32 =
						    (aiocb32_t *)(uintptr_t)
						    *ucbp32++) == NULL)
							continue;
						reqp = aio_req_done(
						    &cbp32->aio_resultp);
					} else if (run_mode == AIO_LARGEFILE) {
						if ((cbp64 =
						    (aiocb64_32_t *)(uintptr_t)
						    *ucbp32++) == NULL)
							continue;
						reqp = aio_req_done(
						    &cbp64->aio_resultp);
					}

				}
#endif  /* _SYSCALL32_IMPL */
				if (reqp) {
					reqp->aio_req_next = found;
					found = reqp;
				}
				if (aiop->aio_doneq == NULL)
					break;
			}
			if (found)
				break;
		}
		if (aiop->aio_notifycnt > 0) {
			/*
			 * nothing on the kernel's queue. the user
			 * has notified the kernel that it has items
			 * on a user-level queue.
			 */
			aiop->aio_notifycnt--;
			*rval = 1;
			error = 0;
			break;
		}
		/* don't block if nothing is outstanding */
		if (aiop->aio_outstanding == 0) {
			error = EAGAIN;
			break;
		}
		if (blocking) {
			/*
			 * drop the aio_cleanupq_mutex as we are
			 * going to block.
			 */
			mutex_exit(&aiop->aio_cleanupq_mutex);
			rv = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);
			/*
			 * we have to drop aio_mutex and
			 * grab it in the right order.
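			 * (aio_cleanupq_mutex must be acquired
			 * before aio_mutex).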
			 */
			mutex_exit(&aiop->aio_mutex);
			mutex_enter(&aiop->aio_cleanupq_mutex);
			mutex_enter(&aiop->aio_mutex);
			if (rv > 0)	/* check done queue again */
				continue;
			if (rv == 0)	/* interrupted by a signal */
				error = EINTR;
			else		/* timer expired */
				error = ETIME;
		} else {
			error = EAGAIN;
		}
		break;
	}
	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_cleanupq_mutex);
	for (reqp = found; reqp != NULL; reqp = next) {
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}
done:
	kmem_free(cbplist, ssize);
	return (error);
}

/*
 * initialize aio by allocating an aio_t struct for this
 * process.
 */
static int
aioinit(void)
{
	proc_t *p = curproc;
	aio_t *aiop;
	mutex_enter(&p->p_lock);
	if ((aiop = p->p_aio) == NULL) {
		aiop = aio_aiop_alloc();
		p->p_aio = aiop;
	}
	mutex_exit(&p->p_lock);
	if (aiop == NULL)
		return (ENOMEM);
	return (0);
}

/*
 * start a special thread that will cleanup after aio requests
 * that are preventing a segment from being unmapped. as_unmap()
 * blocks until all physio to this segment is completed. this
 * doesn't happen until all the pages in this segment are not
 * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
 * requests still outstanding. this special thread will make sure
 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
 *
 * this function will return an error if the process has only
 * one LWP. the assumption is that the caller is a separate LWP
 * that remains blocked in the kernel for the life of this process.
 */
static int
aiostart(void)
{
	proc_t *p = curproc;
	aio_t *aiop;
	int first, error = 0;

	if (p->p_lwpcnt == 1)
		return (EDEADLK);
	mutex_enter(&p->p_lock);
	if ((aiop = p->p_aio) == NULL)
		error = EINVAL;
	else {
		first = aiop->aio_ok;
		if (aiop->aio_ok == 0)
			aiop->aio_ok = 1;
	}
	mutex_exit(&p->p_lock);
	if (error == 0 && first == 0) {
		return (aio_cleanup_thread(aiop));
		/* should return only to exit */
	}
	return (error);
}

/*
 * Associate an aiocb with a port.
 * This function is used by aiorw() to associate a transaction with a port.
 * Allocate an event port structure (port_alloc_event()) and store the
 * delivered user pointer (portnfy_user) in the portkev_user field of the
 * port_kevent_t structure.
 * The aio_req_portkev pointer in the aio_req_t structure was added to identify
 * the port association.
 */

static int
aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
    aio_req_t *reqp, int event)
{
	port_kevent_t	*pkevp = NULL;
	int		error;

	error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
	    PORT_SOURCE_AIO, &pkevp);
	if (error) {
		if ((error == ENOMEM) || (error == EAGAIN))
			error = EAGAIN;
		else
			error = EINVAL;
	} else {
		port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
		    aio_port_callback, reqp);
		pkevp->portkev_events = event;
		reqp->aio_req_portkev = pkevp;
		reqp->aio_req_port = pntfy->portnfy_port;
	}
	return (error);
}

#ifdef _LP64

/*
 * Asynchronous list IO. A chain of aiocb's are copied in
 * one at a time. If the aiocb is invalid, it is skipped.
 * For each aiocb, the appropriate driver entry point is
 * called. Optimize for the common case where the list
 * of requests is to the same file descriptor.
 *
 * One possible optimization is to define a new driver entry
 * point that supports a list of IO requests. Whether this
 * improves performance depends somewhat on the driver's
 * locking strategy. Processing a list could adversely impact
 * the driver's interrupt latency.
 */
static int
alio(
	int		mode_arg,
	aiocb_t		**aiocb_arg,
	int		nent,
	struct sigevent	*sigev)
{
	file_t		*fp;
	file_t		*prev_fp = NULL;
	int		prev_mode = -1;
	struct vnode	*vp;
	aio_lio_t	*head;
	aio_req_t	*reqp;
	aio_t		*aiop;
	caddr_t		cbplist;
	aiocb_t		cb;
	aiocb_t		*aiocb = &cb;
	aiocb_t		*cbp;
	aiocb_t		**ucbp;
	struct sigevent	sigevk;
	sigqueue_t	*sqp;
	int		(*aio_func)();
	int		mode;
	int		error = 0;
	int		aio_errors = 0;
	int		i;
	size_t		ssize;
	int		deadhead = 0;
	int		aio_notsupported = 0;
	int		lio_head_port;
	int		aio_port;
	int		aio_thread;
	port_kevent_t	*pkevtp = NULL;
	int		portused = 0;
	port_notify_t	pnotify;
	int		event;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	ssize = (sizeof (aiocb_t *) * nent);
	cbplist = kmem_alloc(ssize, KM_SLEEP);
	ucbp = (aiocb_t **)cbplist;

	if (copyin(aiocb_arg, cbplist, ssize) ||
	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
		kmem_free(cbplist, ssize);
		return (EFAULT);
	}

	/* Event Ports  */
	if (sigev &&
	    (sigevk.sigev_notify == SIGEV_THREAD ||
	    sigevk.sigev_notify == SIGEV_PORT)) {
		if (sigevk.sigev_notify == SIGEV_THREAD) {
			pnotify.portnfy_port = sigevk.sigev_signo;
			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
		} else if (copyin(sigevk.sigev_value.sival_ptr,
		    &pnotify, sizeof (pnotify))) {
			kmem_free(cbplist, ssize);
			return (EFAULT);
		}
		error = port_alloc_event(pnotify.portnfy_port,
		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
		if (error) {
			if (error == ENOMEM || error == EAGAIN)
				error = EAGAIN;
			else
				error = EINVAL;
			kmem_free(cbplist, ssize);
			return (error);
		}
		lio_head_port = pnotify.portnfy_port;
		portused = 1;
	}

	/*
	 * a list head should be allocated if notification is
	 * enabled for this list.
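	 * (i.e. when the caller used LIO_WAIT or supplied a sigevent).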
	 */
	head = NULL;

	if (mode_arg == LIO_WAIT || sigev) {
		mutex_enter(&aiop->aio_mutex);
		error = aio_lio_alloc(&head);
		mutex_exit(&aiop->aio_mutex);
		if (error)
			goto done;
		deadhead = 1;
		head->lio_nent = nent;
		head->lio_refcnt = nent;
		head->lio_port = -1;
		head->lio_portkev = NULL;
		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
			if (sqp == NULL) {
				error = EAGAIN;
				goto done;
			}
			sqp->sq_func = NULL;
			sqp->sq_next = NULL;
			sqp->sq_info.si_code = SI_ASYNCIO;
			sqp->sq_info.si_pid = curproc->p_pid;
			sqp->sq_info.si_ctid = PRCTID(curproc);
			sqp->sq_info.si_zoneid = getzoneid();
			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
			sqp->sq_info.si_signo = sigevk.sigev_signo;
			sqp->sq_info.si_value = sigevk.sigev_value;
			head->lio_sigqp = sqp;
		} else {
			head->lio_sigqp = NULL;
		}
		if (pkevtp) {
			/*
			 * Prepare data to send when list of aiocb's
			 * has completed.
			 */
			port_init_event(pkevtp, (uintptr_t)sigev,
			    (void *)(uintptr_t)pnotify.portnfy_user,
			    NULL, head);
			pkevtp->portkev_events = AIOLIO;
			head->lio_portkev = pkevtp;
			head->lio_port = pnotify.portnfy_port;
		}
	}

	for (i = 0; i < nent; i++, ucbp++) {

		cbp = *ucbp;
		/* skip entry if it can't be copied. */
		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* skip if opcode for aiocb is LIO_NOP */
		mode = aiocb->aio_lio_opcode;
		if (mode == LIO_NOP) {
			cbp = NULL;
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* increment file descriptor's ref count. */
		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * check the permission of the partition
		 */
		if ((fp->f_flag & mode) == 0) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * common case where requests are to the same fd
		 * for the same r/w operation.
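		 * check_vp() is only re-evaluated when the fd or the
		 * operation changes from the previous iteration.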
		 * for UFS, need to set EBADFD
		 */
		vp = fp->f_vnode;
		if (fp != prev_fp || mode != prev_mode) {
			aio_func = check_vp(vp, mode);
			if (aio_func == NULL) {
				prev_fp = NULL;
				releasef(aiocb->aio_fildes);
				lio_set_uerror(&cbp->aio_resultp, EBADFD);
				aio_notsupported++;
				if (head) {
					mutex_enter(&aiop->aio_mutex);
					head->lio_nent--;
					head->lio_refcnt--;
					mutex_exit(&aiop->aio_mutex);
				}
				continue;
			} else {
				prev_fp = fp;
				prev_mode = mode;
			}
		}

		error = aio_req_setup(&reqp, aiop, aiocb,
		    &cbp->aio_resultp, vp, 0);
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		reqp->aio_req_lio = head;
		deadhead = 0;

		/*
		 * Set the errno field now before sending the request to
		 * the driver to avoid a race condition
		 */
		(void) suword32(&cbp->aio_resultp.aio_errno,
		    EINPROGRESS);

		reqp->aio_req_iocb.iocb = (caddr_t)cbp;

		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
		if (aio_port | aio_thread) {
			port_kevent_t *lpkevp;
			/*
			 * Prepare data to send with each aiocb completed.
			 */
			if (aio_port) {
				void *paddr =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
				if (copyin(paddr, &pnotify, sizeof (pnotify)))
					error = EFAULT;
			} else {	/* aio_thread */
				pnotify.portnfy_port =
				    aiocb->aio_sigevent.sigev_signo;
				pnotify.portnfy_user =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
			}
			if (error)
				/* EMPTY */;
			else if (pkevtp != NULL &&
			    pnotify.portnfy_port == lio_head_port)
				error = port_dup_event(pkevtp, &lpkevp,
				    PORT_ALLOC_DEFAULT);
			else
				error = port_alloc_event(pnotify.portnfy_port,
				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
				    &lpkevp);
			if (error == 0) {
				port_init_event(lpkevp, (uintptr_t)cbp,
				    (void *)(uintptr_t)pnotify.portnfy_user,
				    aio_port_callback, reqp);
				lpkevp->portkev_events = event;
				reqp->aio_req_portkev = lpkevp;
				reqp->aio_req_port = pnotify.portnfy_port;
			}
		}

		/*
		 * send the request to driver.
		 */
		if (error == 0) {
			if (aiocb->aio_nbytes == 0) {
				clear_active_fd(aiocb->aio_fildes);
				aio_zerolen(reqp);
				continue;
			}
			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
			    CRED());
		}

		/*
		 * the fd's ref count is not decremented until the IO has
		 * completed unless there was an error.
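		 * (on success, clear_active_fd() only drops this thread's
		 * active-fd reservation from getf(); releasef() happens
		 * once the IO completes).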
		 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			if (error == ENOTSUP)
				aio_notsupported++;
			else
				aio_errors++;
			lio_set_error(reqp, portused);
		} else {
			clear_active_fd(aiocb->aio_fildes);
		}
	}

	if (aio_notsupported) {
		error = ENOTSUP;
	} else if (aio_errors) {
		/*
		 * return EIO if any request failed
		 */
		error = EIO;
	}

	if (mode_arg == LIO_WAIT) {
		mutex_enter(&aiop->aio_mutex);
		while (head->lio_refcnt > 0) {
			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
				mutex_exit(&aiop->aio_mutex);
				error = EINTR;
				goto done;
			}
		}
		mutex_exit(&aiop->aio_mutex);
		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
	}

done:
	kmem_free(cbplist, ssize);
	if (deadhead) {
		if (head->lio_sigqp)
			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
		if (head->lio_portkev)
			port_free_event(head->lio_portkev);
		kmem_free(head, sizeof (aio_lio_t));
	}
	return (error);
}

#endif /* _LP64 */

/*
 * Asynchronous list IO.
 * If list I/O is called with LIO_WAIT it can still return
 * before all the I/O's are completed if a signal is caught
 * or if the list includes UFS I/O requests. If this happens,
 * libaio will call aliowait() to wait for the I/O's to
 * complete.
 */
/*ARGSUSED*/
static int
aliowait(
	int	mode,
	void	*aiocb,
	int	nent,
	void	*sigev,
	int	run_mode)
{
	aio_lio_t	*head;
	aio_t		*aiop;
	caddr_t		cbplist;
	aiocb_t		*cbp, **ucbp;
#ifdef	_SYSCALL32_IMPL
	aiocb32_t	*cbp32;
	caddr32_t	*ucbp32;
	aiocb64_32_t	*cbp64;
#endif
	int		error = 0;
	int		i;
	size_t		ssize = 0;
	model_t		model = get_udatamodel();

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	if (model == DATAMODEL_NATIVE)
		ssize = (sizeof (aiocb_t *) * nent);
#ifdef	_SYSCALL32_IMPL
	else
		ssize = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

	if (ssize == 0)
		return (EINVAL);

	cbplist = kmem_alloc(ssize, KM_SLEEP);

	if (model == DATAMODEL_NATIVE)
		ucbp = (aiocb_t **)cbplist;
#ifdef	_SYSCALL32_IMPL
	else
		ucbp32 = (caddr32_t *)cbplist;
#endif  /* _SYSCALL32_IMPL */

	if (copyin(aiocb, cbplist, ssize)) {
		error = EFAULT;
		goto done;
	}

	/*
	 * To find the list head, we go through the
	 * list of aiocb structs, find the request
	 * it is for, then get the list head that reqp
	 * points to
	 */
	head = NULL;

	for (i = 0; i < nent; i++) {
		if (model == DATAMODEL_NATIVE) {
			/*
			 * Since we are only checking for a NULL pointer,
			 * the following should work on both native data
			 * sizes as well as for largefile aiocb.
			 */
			if ((cbp = *ucbp++) == NULL)
				continue;
			if (run_mode != AIO_LARGEFILE)
				if (head = aio_list_get(&cbp->aio_resultp))
					break;
			else {
				/*
				 * This is the case when a largefile call is
				 * made on a 32 bit kernel.
				 * Treat each pointer as pointer to
				 * aiocb64_32
				 */
				if (head = aio_list_get((aio_result_t *)
				    &(((aiocb64_32_t *)cbp)->aio_resultp)))
					break;
			}
		}
#ifdef	_SYSCALL32_IMPL
		else {
			if (run_mode == AIO_LARGEFILE) {
				if ((cbp64 = (aiocb64_32_t *)
				    (uintptr_t)*ucbp32++) == NULL)
					continue;
				if (head = aio_list_get((aio_result_t *)
				    &cbp64->aio_resultp))
					break;
			} else if (run_mode == AIO_32) {
				if ((cbp32 = (aiocb32_t *)
				    (uintptr_t)*ucbp32++) == NULL)
					continue;
				if (head = aio_list_get((aio_result_t *)
				    &cbp32->aio_resultp))
					break;
			}
		}
#endif	/* _SYSCALL32_IMPL */
	}

	if (head == NULL) {
		error = EINVAL;
		goto done;
	}

	mutex_enter(&aiop->aio_mutex);
	while (head->lio_refcnt > 0) {
		if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
			mutex_exit(&aiop->aio_mutex);
			error = EINTR;
			goto done;
		}
	}
	mutex_exit(&aiop->aio_mutex);
	alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
done:
	kmem_free(cbplist, ssize);
	return (error);
}

aio_lio_t *
aio_list_get(aio_result_t *resultp)
{
	aio_lio_t	*head = NULL;
	aio_t		*aiop;
	aio_req_t	**bucket;
	aio_req_t	*reqp;
	long		index;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (NULL);

	if (resultp) {
		index = AIO_HASH(resultp);
		bucket = &aiop->aio_hash[index];
		for (reqp = *bucket; reqp != NULL;
		    reqp = reqp->aio_hash_next) {
			if (reqp->aio_req_resultp == resultp) {
				head = reqp->aio_req_lio;
				return (head);
			}
		}
	}
	return (NULL);
}


static void
lio_set_uerror(void *resultp, int error)
{
	/*
	 * the resultp field is a pointer to where the
	 * error should be written out to the user's
	 * aiocb.
	 *
	 */
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		(void) sulword(&((aio_result_t *)resultp)->aio_return,
		    (ssize_t)-1);
		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
	}
#ifdef	_SYSCALL32_IMPL
	else {
		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
		    (uint_t)-1);
		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
	}
#endif  /* _SYSCALL32_IMPL */
}

/*
 * do cleanup completion for all requests in list. memory for
 * each request is also freed.
 */
static void
alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
{
	int i;
	aio_req_t *reqp;
	aio_result_t *resultp;
	aiocb64_32_t *aiocb_64;

	for (i = 0; i < nent; i++) {
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (cbp[i] == NULL)
				continue;
			if (run_mode == AIO_LARGEFILE) {
				aiocb_64 = (aiocb64_32_t *)cbp[i];
				resultp = (aio_result_t *)
				    &aiocb_64->aio_resultp;
			} else
				resultp = &cbp[i]->aio_resultp;
		}
#ifdef	_SYSCALL32_IMPL
		else {
			aiocb32_t *aiocb_32;
			caddr32_t *cbp32;

			cbp32 = (caddr32_t *)cbp;
			if (cbp32[i] == NULL)
				continue;
			if (run_mode == AIO_32) {
				aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
				resultp = (aio_result_t *)&aiocb_32->
				    aio_resultp;
			} else if (run_mode == AIO_LARGEFILE) {
				aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
				resultp = (aio_result_t *)&aiocb_64->
				    aio_resultp;
			}
		}
#endif  /* _SYSCALL32_IMPL */
		/*
		 * we need to get the aio_cleanupq_mutex since we call
		 * aio_req_done().
		 */
		mutex_enter(&aiop->aio_cleanupq_mutex);
		mutex_enter(&aiop->aio_mutex);
		reqp = aio_req_done(resultp);
		mutex_exit(&aiop->aio_mutex);
		mutex_exit(&aiop->aio_cleanupq_mutex);
		if (reqp != NULL) {
			aphysio_unlock(reqp);
			aio_copyout_result(reqp);
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		}
	}
}

/*
 * Write out the results for an aio request that is done.
 */
static int
aioerror(void *cb, int run_mode)
{
	aio_result_t *resultp;
	aio_t *aiop;
	aio_req_t *reqp;
	int retval;

	aiop = curproc->p_aio;
	if (aiop == NULL || cb == NULL)
		return (EINVAL);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (run_mode == AIO_LARGEFILE)
			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
			    aio_resultp;
		else
			resultp = &((aiocb_t *)cb)->aio_resultp;
	}
#ifdef	_SYSCALL32_IMPL
	else {
		if (run_mode == AIO_LARGEFILE)
			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
			    aio_resultp;
		else if (run_mode == AIO_32)
			resultp = (aio_result_t *)&((aiocb32_t *)cb)->
			    aio_resultp;
	}
#endif  /* _SYSCALL32_IMPL */
	/*
	 * we need to get the aio_cleanupq_mutex since we call
	 * aio_req_find().
	 */
	mutex_enter(&aiop->aio_cleanupq_mutex);
	mutex_enter(&aiop->aio_mutex);
	retval = aio_req_find(resultp, &reqp);
	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_cleanupq_mutex);
	if (retval == 0) {
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
		return (0);
	} else if (retval == 1)
		return (EINPROGRESS);
	else if (retval == 2)
		return (EINVAL);
	return (0);
}

/*
 * aio_cancel - if no requests outstanding,
 *		return AIO_ALLDONE
 *	else
 *		return AIO_NOTCANCELED
 */
static int
aio_cancel(
	int	fildes,
	void	*cb,
	long	*rval,
	int	run_mode)
{
	aio_t *aiop;
	void *resultp;
	int index;
	aio_req_t **bucket;
	aio_req_t *ent;


	/*
	 * Verify valid file descriptor
	 */
	if ((getf(fildes)) == NULL) {
		return (EBADF);
	}
	releasef(fildes);

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	if (aiop->aio_outstanding == 0) {
		*rval = AIO_ALLDONE;
		return (0);
	}

	mutex_enter(&aiop->aio_mutex);
	if (cb != NULL) {
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (run_mode == AIO_LARGEFILE)
				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
				    ->aio_resultp;
			else
				resultp = &((aiocb_t *)cb)->aio_resultp;
		}
#ifdef	_SYSCALL32_IMPL
		else {
			if (run_mode == AIO_LARGEFILE)
				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
				    ->aio_resultp;
			else if (run_mode == AIO_32)
				resultp = (aio_result_t *)&((aiocb32_t *)cb)
				    ->aio_resultp;
		}
#endif  /* _SYSCALL32_IMPL */
		index = AIO_HASH(resultp);
		bucket = &aiop->aio_hash[index];
		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
			if (ent->aio_req_resultp == resultp) {
				if ((ent->aio_req_flags & AIO_PENDING) == 0) {
					mutex_exit(&aiop->aio_mutex);
					*rval = AIO_ALLDONE;
					return (0);
				}
				mutex_exit(&aiop->aio_mutex);
				*rval = AIO_NOTCANCELED;
				return (0);
			}
		}
		mutex_exit(&aiop->aio_mutex);
		*rval = AIO_ALLDONE;
		return (0);
	}

	for (index = 0; index < AIO_HASHSZ; index++) {
		bucket = &aiop->aio_hash[index];
		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
			if (ent->aio_req_fd == fildes) {
				if ((ent->aio_req_flags & AIO_PENDING) != 0) {
					mutex_exit(&aiop->aio_mutex);
					*rval = AIO_NOTCANCELED;
					return (0);
				}
			}
		}
	}
	mutex_exit(&aiop->aio_mutex);
	*rval = AIO_ALLDONE;
	return (0);
}

/*
 * solaris version of asynchronous read and write
 */
static int
arw(
	int	opcode,
	int	fdes,
	char	*bufp,
	int	bufsize,
	offset_t	offset,
	aio_result_t	*resultp,
	int		mode)
{
	file_t		*fp;
	int		error;
	struct vnode	*vp;
	aio_req_t	*reqp;
	aio_t		*aiop;
	int		(*aio_func)();
#ifdef _LP64
	aiocb_t		aiocb;
#else
	aiocb64_32_t	aiocb64;
#endif

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	if ((fp = getf(fdes)) == NULL) {
		return (EBADF);
	}

	/*
	 * check the permission of the partition
	 */
	if ((fp->f_flag & mode) == 0) {
		releasef(fdes);
		return (EBADF);
	}

	vp = fp->f_vnode;
	aio_func = check_vp(vp, mode);
	if (aio_func == NULL) {
		releasef(fdes);
		return (EBADFD);
	}
#ifdef _LP64
	aiocb.aio_fildes = fdes;
	aiocb.aio_buf = bufp;
	aiocb.aio_nbytes = bufsize;
	aiocb.aio_offset = offset;
	aiocb.aio_sigevent.sigev_notify = 0;
	error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1);
#else
	aiocb64.aio_fildes = fdes;
	aiocb64.aio_buf = (caddr32_t)bufp;
	aiocb64.aio_nbytes = bufsize;
	aiocb64.aio_offset = offset;
	aiocb64.aio_sigevent.sigev_notify = 0;
	error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1);
#endif
	if (error) {
		releasef(fdes);
		return (error);
	}

	/*
	 * enable polling on this request if the opcode has
	 * the AIO poll bit set
	 */
	if (opcode & AIO_POLL_BIT)
		reqp->aio_req_flags |= AIO_POLL;

	if (bufsize == 0) {
		clear_active_fd(fdes);
		aio_zerolen(reqp);
		return (0);
	}
	/*
	 * send the request to driver.
	 */
	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
	/*
	 * the fd is stored in the aio_req_t by aio_req_setup(), and
	 * is released by the aio_cleanup_thread() when the IO has
	 * completed.
	 */
	if (error) {
		releasef(fdes);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		aiop->aio_pending--;
		if (aiop->aio_flags & AIO_REQ_BLOCK)
			cv_signal(&aiop->aio_cleanupcv);
		mutex_exit(&aiop->aio_mutex);
		return (error);
	}
	clear_active_fd(fdes);
	return (0);
}

/*
 * posix version of asynchronous read and write
 */
static int
aiorw(
	int		opcode,
	void		*aiocb_arg,
	int		mode,
	int		run_mode)
{
#ifdef	_SYSCALL32_IMPL
	aiocb32_t	aiocb32;
	struct	sigevent32	*sigev32;
	port_notify32_t	pntfy32;
#endif
	aiocb64_32_t	aiocb64;
	aiocb_t		aiocb;
	file_t		*fp;
	int		error, fd;
	size_t		bufsize;
	struct vnode	*vp;
	aio_req_t	*reqp;
	aio_t		*aiop;
	int		(*aio_func)();
	aio_result_t	*resultp;
	struct	sigevent *sigev;
	model_t		model;
	int		aio_use_port = 0;
	port_notify_t	pntfy;

	model = get_udatamodel();
	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	if (model == DATAMODEL_NATIVE) {
		if (run_mode != AIO_LARGEFILE) {
			if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
				return (EFAULT);
			bufsize = aiocb.aio_nbytes;
			resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
			if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
				return (EBADF);
			}
			sigev = &aiocb.aio_sigevent;
		} else {
			/*
			 * We come here only when we make largefile
			 * call on 32 bit kernel using 32 bit library.
			 */
			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
				return (EFAULT);
			bufsize = aiocb64.aio_nbytes;
			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
			    ->aio_resultp);
			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
				return (EBADF);
			sigev = (struct sigevent *)&aiocb64.aio_sigevent;
		}

		if (sigev->sigev_notify == SIGEV_PORT) {
			if (copyin((void *)sigev->sigev_value.sival_ptr,
			    &pntfy, sizeof (port_notify_t))) {
				releasef(fd);
				return (EFAULT);
			}
			aio_use_port = 1;
		} else if (sigev->sigev_notify == SIGEV_THREAD) {
			pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo;
			pntfy.portnfy_user =
			    aiocb.aio_sigevent.sigev_value.sival_ptr;
			aio_use_port = 1;
		}
	}
#ifdef	_SYSCALL32_IMPL
	else {
		if (run_mode == AIO_32) {
			/* 32 bit system call is being made on 64 bit kernel */
			if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
				return (EFAULT);

			bufsize = aiocb32.aio_nbytes;
			aiocb_32ton(&aiocb32, &aiocb);
			resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
			    aio_resultp);
			if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
				return (EBADF);
			}
			sigev32 = &aiocb32.aio_sigevent;
		} else if (run_mode == AIO_LARGEFILE) {
			/*
			 * We come here only when we make largefile
			 * call on 64 bit kernel using 32 bit library.
			 */
			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
				return (EFAULT);
			bufsize = aiocb64.aio_nbytes;
			aiocb_LFton(&aiocb64, &aiocb);
			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
			    ->aio_resultp);
			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
				return (EBADF);
			sigev32 = &aiocb64.aio_sigevent;
		}

		if (sigev32->sigev_notify == SIGEV_PORT) {
			if (copyin(
			    (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
			    &pntfy32, sizeof (port_notify32_t))) {
				releasef(fd);
				return (EFAULT);
			}
			pntfy.portnfy_port = pntfy32.portnfy_port;
			pntfy.portnfy_user = (void *)(uintptr_t)
			    pntfy32.portnfy_user;
			aio_use_port = 1;
		} else if (sigev32->sigev_notify == SIGEV_THREAD) {
			pntfy.portnfy_port = sigev32->sigev_signo;
			pntfy.portnfy_user = (void *)(uintptr_t)
			    sigev32->sigev_value.sival_ptr;
			aio_use_port = 1;
		}
	}
#endif  /* _SYSCALL32_IMPL */

	/*
	 * check the permission of the partition
	 */

	if ((fp->f_flag & mode) == 0) {
		releasef(fd);
		return (EBADF);
	}

	vp = fp->f_vnode;
	aio_func = check_vp(vp, mode);
	if (aio_func == NULL) {
		releasef(fd);
		return (EBADFD);
	}
	if (run_mode == AIO_LARGEFILE)
		error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0);
	else
		error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0);

	if (error) {
		releasef(fd);
		return (error);
	}
	/*
	 * enable polling on this request if the opcode has
	 * the AIO poll bit set
	 */
	if (opcode & AIO_POLL_BIT)
		reqp->aio_req_flags |= AIO_POLL;

	if (model == DATAMODEL_NATIVE)
		reqp->aio_req_iocb.iocb = aiocb_arg;
#ifdef  _SYSCALL32_IMPL
	else
		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
#endif

	if (aio_use_port) {
		int event = (run_mode == AIO_LARGEFILE)?
		    ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
		    ((mode == FREAD)? AIOAREAD : AIOAWRITE);
		error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
	}

	/*
	 * send the request to driver.
	 */
	if (error == 0) {
		if (bufsize == 0) {
			clear_active_fd(fd);
			aio_zerolen(reqp);
			return (0);
		}
		error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
	}

	/*
	 * the fd is stored in the aio_req_t by aio_req_setup(), and
	 * is released by the aio_cleanup_thread() when the IO has
	 * completed.
	 */
	if (error) {
		releasef(fd);
		mutex_enter(&aiop->aio_mutex);
		if (aio_use_port)
			aio_deq(&aiop->aio_portpending, reqp);
		aio_req_free(aiop, reqp);
		aiop->aio_pending--;
		if (aiop->aio_flags & AIO_REQ_BLOCK)
			cv_signal(&aiop->aio_cleanupcv);
		mutex_exit(&aiop->aio_mutex);
		return (error);
	}
	clear_active_fd(fd);
	return (0);
}


/*
 * set error for a list IO entry that failed.
 */
static void
lio_set_error(aio_req_t *reqp, int portused)
{
	aio_t *aiop = curproc->p_aio;

	if (aiop == NULL)
		return;

	mutex_enter(&aiop->aio_mutex);
	if (portused)
		aio_deq(&aiop->aio_portpending, reqp);
	aiop->aio_pending--;
	/* request failed, AIO_PHYSIODONE set to avoid physio cleanup. */
	reqp->aio_req_flags |= AIO_PHYSIODONE;
	/*
	 * Need to free the request now as it's never
	 * going to get on the done queue
	 *
	 * Note: aio_outstanding is decremented in
	 *	 aio_req_free()
	 */
	aio_req_free(aiop, reqp);
	if (aiop->aio_flags & AIO_REQ_BLOCK)
		cv_signal(&aiop->aio_cleanupcv);
	mutex_exit(&aiop->aio_mutex);
}

/*
 * check if a specified request is done, and remove it from
 * the done queue. otherwise remove anybody from the done queue
 * if NULL is specified.
 */
static aio_req_t *
aio_req_done(void *resultp)
{
	aio_req_t **bucket;
	aio_req_t *ent;
	aio_t *aiop = curproc->p_aio;
	long index;

	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	if (resultp) {
		index = AIO_HASH(resultp);
		bucket = &aiop->aio_hash[index];
		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
			if (ent->aio_req_resultp == (aio_result_t *)resultp) {
				if (ent->aio_req_flags & AIO_DONEQ) {
					return (aio_req_remove(ent));
				}
				return (NULL);
			}
		}
		/* no match, resultp is invalid */
		return (NULL);
	}
	return (aio_req_remove(NULL));
}

/*
 * determine if a user-level resultp pointer is associated with an
 * active IO request. Zero is returned when the request is done,
 * and the request is removed from the done queue. Only when the
 * return value is zero, is the "reqp" pointer valid. One is returned
 * when the request is in progress. Two is returned when the request
 * is invalid.
2312 */ 2313 static int 2314 aio_req_find(aio_result_t *resultp, aio_req_t **reqp) 2315 { 2316 aio_req_t **bucket; 2317 aio_req_t *ent; 2318 aio_t *aiop = curproc->p_aio; 2319 long index; 2320 2321 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex)); 2322 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2323 2324 index = AIO_HASH(resultp); 2325 bucket = &aiop->aio_hash[index]; 2326 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 2327 if (ent->aio_req_resultp == resultp) { 2328 if (ent->aio_req_flags & AIO_DONEQ) { 2329 *reqp = aio_req_remove(ent); 2330 return (0); 2331 } 2332 return (1); 2333 } 2334 } 2335 /* no match, resultp is invalid */ 2336 return (2); 2337 } 2338 2339 /* 2340 * remove a request from the done queue. 2341 */ 2342 static aio_req_t * 2343 aio_req_remove(aio_req_t *reqp) 2344 { 2345 aio_t *aiop = curproc->p_aio; 2346 2347 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2348 2349 if (reqp != NULL) { 2350 ASSERT(reqp->aio_req_flags & AIO_DONEQ); 2351 if (reqp->aio_req_next == reqp) { 2352 /* only one request on queue */ 2353 if (reqp == aiop->aio_doneq) { 2354 aiop->aio_doneq = NULL; 2355 } else { 2356 ASSERT(reqp == aiop->aio_cleanupq); 2357 aiop->aio_cleanupq = NULL; 2358 } 2359 } else { 2360 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2361 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2362 /* 2363 * The request can be either on the aio_doneq or the 2364 * aio_cleanupq 2365 */ 2366 if (reqp == aiop->aio_doneq) 2367 aiop->aio_doneq = reqp->aio_req_next; 2368 2369 if (reqp == aiop->aio_cleanupq) 2370 aiop->aio_cleanupq = reqp->aio_req_next; 2371 } 2372 reqp->aio_req_flags &= ~AIO_DONEQ; 2373 reqp->aio_req_next = NULL; 2374 reqp->aio_req_prev = NULL; 2375 } else if ((reqp = aiop->aio_doneq) != NULL) { 2376 ASSERT(reqp->aio_req_flags & AIO_DONEQ); 2377 if (reqp == reqp->aio_req_next) { 2378 /* only one request on queue */ 2379 aiop->aio_doneq = NULL; 2380 } else { 2381 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2382 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2383 aiop->aio_doneq = reqp->aio_req_next; 2384 } 2385 reqp->aio_req_flags &= ~AIO_DONEQ; 2386 reqp->aio_req_next = NULL; 2387 reqp->aio_req_prev = NULL; 2388 } 2389 if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN)) 2390 cv_broadcast(&aiop->aio_waitcv); 2391 return (reqp); 2392 } 2393 2394 static int 2395 aio_req_setup( 2396 aio_req_t **reqpp, 2397 aio_t *aiop, 2398 aiocb_t *arg, 2399 aio_result_t *resultp, 2400 vnode_t *vp, 2401 int old_solaris_req) 2402 { 2403 sigqueue_t *sqp = NULL; 2404 aio_req_t *reqp; 2405 struct uio *uio; 2406 struct sigevent *sigev; 2407 int error; 2408 2409 sigev = &arg->aio_sigevent; 2410 if (sigev->sigev_notify == SIGEV_SIGNAL && 2411 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) { 2412 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 2413 if (sqp == NULL) 2414 return (EAGAIN); 2415 sqp->sq_func = NULL; 2416 sqp->sq_next = NULL; 2417 sqp->sq_info.si_code = SI_ASYNCIO; 2418 sqp->sq_info.si_pid = curproc->p_pid; 2419 sqp->sq_info.si_ctid = PRCTID(curproc); 2420 sqp->sq_info.si_zoneid = getzoneid(); 2421 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 2422 sqp->sq_info.si_signo = sigev->sigev_signo; 2423 sqp->sq_info.si_value = sigev->sigev_value; 2424 } 2425 2426 mutex_enter(&aiop->aio_mutex); 2427 2428 if (aiop->aio_flags & AIO_REQ_BLOCK) { 2429 mutex_exit(&aiop->aio_mutex); 2430 if (sqp) 2431 kmem_free(sqp, sizeof (sigqueue_t)); 2432 return (EIO); 2433 } 2434 /* 2435 * get an aio_reqp from the free list or allocate one 2436 * from dynamic 
memory. 2437 */ 2438 if (error = aio_req_alloc(&reqp, resultp)) { 2439 mutex_exit(&aiop->aio_mutex); 2440 if (sqp) 2441 kmem_free(sqp, sizeof (sigqueue_t)); 2442 return (error); 2443 } 2444 aiop->aio_pending++; 2445 aiop->aio_outstanding++; 2446 reqp->aio_req_flags = AIO_PENDING; 2447 if (old_solaris_req) { 2448 /* this is an old solaris aio request */ 2449 reqp->aio_req_flags |= AIO_SOLARIS; 2450 aiop->aio_flags |= AIO_SOLARIS_REQ; 2451 } 2452 if (sigev->sigev_notify == SIGEV_THREAD || 2453 sigev->sigev_notify == SIGEV_PORT) 2454 aio_enq(&aiop->aio_portpending, reqp, 0); 2455 mutex_exit(&aiop->aio_mutex); 2456 /* 2457 * initialize aio request. 2458 */ 2459 reqp->aio_req_fd = arg->aio_fildes; 2460 reqp->aio_req_sigqp = sqp; 2461 reqp->aio_req_iocb.iocb = NULL; 2462 reqp->aio_req_lio = NULL; 2463 reqp->aio_req_buf.b_file = vp; 2464 uio = reqp->aio_req.aio_uio; 2465 uio->uio_iovcnt = 1; 2466 uio->uio_iov->iov_base = (caddr_t)arg->aio_buf; 2467 uio->uio_iov->iov_len = arg->aio_nbytes; 2468 uio->uio_loffset = arg->aio_offset; 2469 *reqpp = reqp; 2470 return (0); 2471 } 2472 2473 /* 2474 * Allocate p_aio struct. 2475 */ 2476 static aio_t * 2477 aio_aiop_alloc(void) 2478 { 2479 aio_t *aiop; 2480 2481 ASSERT(MUTEX_HELD(&curproc->p_lock)); 2482 2483 aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP); 2484 if (aiop) { 2485 mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL); 2486 mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT, 2487 NULL); 2488 mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL); 2489 } 2490 return (aiop); 2491 } 2492 2493 /* 2494 * Allocate an aio_req struct. 2495 */ 2496 static int 2497 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp) 2498 { 2499 aio_req_t *reqp; 2500 aio_t *aiop = curproc->p_aio; 2501 2502 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2503 2504 if ((reqp = aiop->aio_free) != NULL) { 2505 aiop->aio_free = reqp->aio_req_next; 2506 bzero(reqp, sizeof (*reqp)); 2507 } else { 2508 /* 2509 * Check whether memory is getting tight. 2510 * This is a temporary mechanism to avoid memory 2511 * exhaustion by a single process until we come up 2512 * with a per process solution such as setrlimit(). 2513 */ 2514 if (freemem < desfree) 2515 return (EAGAIN); 2516 reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP); 2517 if (reqp == NULL) 2518 return (EAGAIN); 2519 } 2520 reqp->aio_req.aio_uio = &reqp->aio_req_uio; 2521 reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov; 2522 reqp->aio_req.aio_private = reqp; 2523 reqp->aio_req_buf.b_offset = -1; 2524 reqp->aio_req_resultp = resultp; 2525 if (aio_hash_insert(reqp, aiop)) { 2526 reqp->aio_req_next = aiop->aio_free; 2527 aiop->aio_free = reqp; 2528 return (EBUSY); 2529 } 2530 *nreqp = reqp; 2531 return (0); 2532 } 2533 2534 /* 2535 * Allocate an aio_lio_t struct. 2536 */ 2537 static int 2538 aio_lio_alloc(aio_lio_t **head) 2539 { 2540 aio_lio_t *liop; 2541 aio_t *aiop = curproc->p_aio; 2542 2543 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2544 2545 if ((liop = aiop->aio_lio_free) != NULL) { 2546 aiop->aio_lio_free = liop->lio_next; 2547 } else { 2548 /* 2549 * Check whether memory is getting tight. 2550 * This is a temporary mechanism to avoid memory 2551 * exhaustion by a single process until we come up 2552 * with a per process solution such as setrlimit(). 
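 * The same freemem < desfree check is made in aio_req_alloc() above; in
 * both cases the resulting EAGAIN is returned to the application, which
 * can retry the request later.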
 */
		if (freemem < desfree)
			return (EAGAIN);

		liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
		if (liop == NULL)
			return (EAGAIN);
	}
	*head = liop;
	return (0);
}

/*
 * This is a special per-process thread that is only activated if
 * the process is unmapping a segment with outstanding aio. Normally,
 * the process will have completed the aio before unmapping the
 * segment. If the process does unmap a segment with outstanding aio,
 * this special thread will guarantee that the locked pages due to
 * aphysio() are released, thereby permitting the segment to be
 * unmapped. In addition to this, the cleanup thread is woken up
 * during DR operations to release the locked pages.
 */

static int
aio_cleanup_thread(aio_t *aiop)
{
	proc_t *p = curproc;
	struct as *as = p->p_as;
	int poked = 0;
	kcondvar_t *cvp;
	int exit_flag = 0;
	int rqclnup = 0;

	sigfillset(&curthread->t_hold);
	sigdiffset(&curthread->t_hold, &cantmask);
	for (;;) {
		/*
		 * If a segment is being unmapped, and the current
		 * process's done queue is not empty, then every request
		 * on the doneq with locked resources should be forced
		 * to release their locks. By moving the doneq request
		 * to the cleanupq, aio_cleanup() will process the cleanupq,
		 * and place requests back onto the doneq. All requests
		 * processed by aio_cleanup() will have their physical
		 * resources unlocked.
		 */
		mutex_enter(&aiop->aio_mutex);
		if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
			aiop->aio_flags |= AIO_CLEANUP;
			mutex_enter(&as->a_contents);
			if (aiop->aio_rqclnup) {
				aiop->aio_rqclnup = 0;
				rqclnup = 1;
			}
			mutex_exit(&as->a_contents);
			if (aiop->aio_doneq) {
				aio_req_t *doneqhead = aiop->aio_doneq;
				aiop->aio_doneq = NULL;
				aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
			}
		}
		mutex_exit(&aiop->aio_mutex);
		aio_cleanup(AIO_CLEANUP_THREAD);
		/*
		 * The thread should block on the cleanupcv while
		 * AIO_CLEANUP is set.
		 */
		cvp = &aiop->aio_cleanupcv;
		mutex_enter(&aiop->aio_mutex);

		if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
		    aiop->aio_notifyq != NULL ||
		    aiop->aio_portcleanupq != NULL) {
			mutex_exit(&aiop->aio_mutex);
			continue;
		}
		mutex_enter(&as->a_contents);

		/*
		 * AIO_CLEANUP determines when the cleanup thread
		 * should be active. This flag is set when
		 * the cleanup thread is awakened by as_unmap() or
		 * due to DR operations.
		 * The flag is cleared when the blocking as_unmap()
		 * that originally awakened us is allowed to
		 * complete. as_unmap() blocks when trying to
		 * unmap a segment that has SOFTLOCKed pages. When
		 * the segment's pages are all SOFTUNLOCKed,
		 * as->a_flags & AS_UNMAPWAIT should be zero.
		 *
		 * In case of cleanup request by DR, the flag is cleared
		 * once all the pending aio requests have been processed.
		 *
		 * The flag shouldn't be cleared right away if the
		 * cleanup thread was interrupted because the process
		 * is doing forkall(). This happens when cv_wait_sig()
		 * returns zero, because it was awakened by a pokelwps().
		 * If the process is not exiting, it must be doing forkall().
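 *
 * Concretely, the flags are cleared below only when this thread was not
 * poked by pokelwps() and either the unmap wait has drained (no DR
 * cleanup request and AS_ISUNMAPWAIT(as) is zero) or there are no more
 * pending aio requests left to process.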
2651 */ 2652 if ((poked == 0) && 2653 ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) || 2654 (aiop->aio_pending == 0))) { 2655 aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT); 2656 cvp = &as->a_cv; 2657 rqclnup = 0; 2658 } 2659 mutex_exit(&aiop->aio_mutex); 2660 if (poked) { 2661 /* 2662 * If the process is exiting/killed, don't return 2663 * immediately without waiting for pending I/O's 2664 * and releasing the page locks. 2665 */ 2666 if (p->p_flag & (SEXITLWPS|SKILLED)) { 2667 /* 2668 * If exit_flag is set, then it is 2669 * safe to exit because we have released 2670 * page locks of completed I/O's. 2671 */ 2672 if (exit_flag) 2673 break; 2674 2675 mutex_exit(&as->a_contents); 2676 2677 /* 2678 * Wait for all the pending aio to complete. 2679 */ 2680 mutex_enter(&aiop->aio_mutex); 2681 aiop->aio_flags |= AIO_REQ_BLOCK; 2682 while (aiop->aio_pending != 0) 2683 cv_wait(&aiop->aio_cleanupcv, 2684 &aiop->aio_mutex); 2685 mutex_exit(&aiop->aio_mutex); 2686 exit_flag = 1; 2687 continue; 2688 } else if (p->p_flag & 2689 (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) { 2690 /* 2691 * hold LWP until it 2692 * is continued. 2693 */ 2694 mutex_exit(&as->a_contents); 2695 mutex_enter(&p->p_lock); 2696 stop(PR_SUSPENDED, SUSPEND_NORMAL); 2697 mutex_exit(&p->p_lock); 2698 poked = 0; 2699 continue; 2700 } 2701 } else { 2702 /* 2703 * When started this thread will sleep on as->a_cv. 2704 * as_unmap will awake this thread if the 2705 * segment has SOFTLOCKed pages (poked = 0). 2706 * 1. pokelwps() awakes this thread => 2707 * break the loop to check SEXITLWPS, SHOLDFORK, etc 2708 * 2. as_unmap awakes this thread => 2709 * to break the loop it is necessary that 2710 * - AS_UNMAPWAIT is set (as_unmap is waiting for 2711 * memory to be unlocked) 2712 * - AIO_CLEANUP is not set 2713 * (if AIO_CLEANUP is set we have to wait for 2714 * pending requests. aio_done will send a signal 2715 * for every request which completes to continue 2716 * unmapping the corresponding address range) 2717 * 3. A cleanup request will wake this thread up, ex. 2718 * by the DR operations. The aio_rqclnup flag will 2719 * be set. 2720 */ 2721 while (poked == 0) { 2722 /* 2723 * The clean up requests that came in 2724 * after we had just cleaned up, couldn't 2725 * be causing the unmap thread to block - as 2726 * unmap event happened first. 2727 * Let aio_done() wake us up if it sees a need. 2728 */ 2729 if (aiop->aio_rqclnup && 2730 (aiop->aio_flags & AIO_CLEANUP) == 0) 2731 break; 2732 poked = !cv_wait_sig(cvp, &as->a_contents); 2733 if (AS_ISUNMAPWAIT(as) == 0) 2734 cv_signal(cvp); 2735 if (aiop->aio_outstanding != 0) 2736 break; 2737 } 2738 } 2739 mutex_exit(&as->a_contents); 2740 } 2741 exit: 2742 mutex_exit(&as->a_contents); 2743 ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED))); 2744 aston(curthread); /* make thread do post_syscall */ 2745 return (0); 2746 } 2747 2748 /* 2749 * save a reference to a user's outstanding aio in a hash list. 
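 * The bucket in the aiop->aio_hash[] array is selected by
 * AIO_HASH(resultp) and its entries are singly linked through
 * aio_req_t.aio_hash_next; a resultp that is already present makes the
 * insert fail with DUPLICATE.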
 */
static int
aio_hash_insert(
	aio_req_t *aio_reqp,
	aio_t *aiop)
{
	long index;
	aio_result_t *resultp = aio_reqp->aio_req_resultp;
	aio_req_t *current;
	aio_req_t **nextp;

	index = AIO_HASH(resultp);
	nextp = &aiop->aio_hash[index];
	while ((current = *nextp) != NULL) {
		if (current->aio_req_resultp == resultp)
			return (DUPLICATE);
		nextp = &current->aio_hash_next;
	}
	*nextp = aio_reqp;
	aio_reqp->aio_hash_next = NULL;
	return (0);
}

static int
(*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
    cred_t *)
{
	struct snode *sp;
	dev_t dev;
	struct cb_ops *cb;
	major_t major;
	int (*aio_func)();

	dev = vp->v_rdev;
	major = getmajor(dev);

	/*
	 * Return NULL for requests to files and STREAMs so
	 * that libaio takes care of them.
	 */
	if (vp->v_type == VCHR) {
		/* no stream device for kaio */
		if (STREAMSTAB(major)) {
			return (NULL);
		}
	} else {
		return (NULL);
	}

	/*
	 * Check old drivers which do not have async I/O entry points.
	 */
	if (devopsp[major]->devo_rev < 3)
		return (NULL);

	cb = devopsp[major]->devo_cb_ops;

	if (cb->cb_rev < 1)
		return (NULL);

	/*
	 * Check whether this device is a block device.
	 * Kaio is not supported for devices like tty.
	 */
	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
		return (NULL);

	/*
	 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
	 * We cannot call the driver directly. Instead return the
	 * PXFS functions.
	 */

	if (IS_PXFSVP(vp)) {
		if (mode & FREAD)
			return (clpxfs_aio_read);
		else
			return (clpxfs_aio_write);
	}
	if (mode & FREAD)
		aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
	else
		aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;

	/*
	 * Do we need this?
	 * nodev returns ENXIO anyway.
	 */
	if (aio_func == nodev)
		return (NULL);

	sp = VTOS(vp);
	smark(sp, SACC);
	return (aio_func);
}

/*
 * Clustering: We want check_vp to return a correctly prototyped
 * function that is common to both the PXFS and regular cases.
 * We define this intermediate function that will do the right
 * thing for driver cases.
 */

static int
driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
{
	dev_t dev;
	struct cb_ops *cb;

	ASSERT(vp->v_type == VCHR);
	ASSERT(!IS_PXFSVP(vp));
	dev = VTOS(vp)->s_dev;
	ASSERT(STREAMSTAB(getmajor(dev)) == NULL);

	cb = devopsp[getmajor(dev)]->devo_cb_ops;

	ASSERT(cb->cb_awrite != nodev);
	return ((*cb->cb_awrite)(dev, aio, cred_p));
}

/*
 * Clustering: We want check_vp to return a correctly prototyped
 * function that is common to both the PXFS and regular cases.
 * We define this intermediate function that will do the right
 * thing for driver cases.
2875 */ 2876 2877 static int 2878 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p) 2879 { 2880 dev_t dev; 2881 struct cb_ops *cb; 2882 2883 ASSERT(vp->v_type == VCHR); 2884 ASSERT(!IS_PXFSVP(vp)); 2885 dev = VTOS(vp)->s_dev; 2886 ASSERT(!STREAMSTAB(getmajor(dev))); 2887 2888 cb = devopsp[getmajor(dev)]->devo_cb_ops; 2889 2890 ASSERT(cb->cb_aread != nodev); 2891 return ((*cb->cb_aread)(dev, aio, cred_p)); 2892 } 2893 2894 /* 2895 * This routine is called when a largefile call is made by a 32bit 2896 * process on a ILP32 or LP64 kernel. All 64bit processes are large 2897 * file by definition and will call alio() instead. 2898 */ 2899 static int 2900 alioLF( 2901 int mode_arg, 2902 void *aiocb_arg, 2903 int nent, 2904 void *sigev) 2905 { 2906 file_t *fp; 2907 file_t *prev_fp = NULL; 2908 int prev_mode = -1; 2909 struct vnode *vp; 2910 aio_lio_t *head; 2911 aio_req_t *reqp; 2912 aio_t *aiop; 2913 caddr_t cbplist; 2914 aiocb64_32_t cb64; 2915 aiocb64_32_t *aiocb = &cb64; 2916 aiocb64_32_t *cbp; 2917 caddr32_t *ucbp; 2918 #ifdef _LP64 2919 aiocb_t aiocb_n; 2920 #endif 2921 struct sigevent32 sigevk; 2922 sigqueue_t *sqp; 2923 int (*aio_func)(); 2924 int mode; 2925 int error = 0; 2926 int aio_errors = 0; 2927 int i; 2928 size_t ssize; 2929 int deadhead = 0; 2930 int aio_notsupported = 0; 2931 int lio_head_port; 2932 int aio_port; 2933 int aio_thread; 2934 port_kevent_t *pkevtp = NULL; 2935 int portused = 0; 2936 port_notify32_t pnotify; 2937 int event; 2938 2939 aiop = curproc->p_aio; 2940 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 2941 return (EINVAL); 2942 2943 ASSERT(get_udatamodel() == DATAMODEL_ILP32); 2944 2945 ssize = (sizeof (caddr32_t) * nent); 2946 cbplist = kmem_alloc(ssize, KM_SLEEP); 2947 ucbp = (caddr32_t *)cbplist; 2948 2949 if (copyin(aiocb_arg, cbplist, ssize) || 2950 (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) { 2951 kmem_free(cbplist, ssize); 2952 return (EFAULT); 2953 } 2954 2955 /* Event Ports */ 2956 if (sigev && 2957 (sigevk.sigev_notify == SIGEV_THREAD || 2958 sigevk.sigev_notify == SIGEV_PORT)) { 2959 if (sigevk.sigev_notify == SIGEV_THREAD) { 2960 pnotify.portnfy_port = sigevk.sigev_signo; 2961 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr; 2962 } else if (copyin( 2963 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr, 2964 &pnotify, sizeof (pnotify))) { 2965 kmem_free(cbplist, ssize); 2966 return (EFAULT); 2967 } 2968 error = port_alloc_event(pnotify.portnfy_port, 2969 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp); 2970 if (error) { 2971 if (error == ENOMEM || error == EAGAIN) 2972 error = EAGAIN; 2973 else 2974 error = EINVAL; 2975 kmem_free(cbplist, ssize); 2976 return (error); 2977 } 2978 lio_head_port = pnotify.portnfy_port; 2979 portused = 1; 2980 } 2981 2982 /* 2983 * a list head should be allocated if notification is 2984 * enabled for this list. 
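 * head->lio_nent and head->lio_refcnt both start at nent and are
 * decremented for every aiocb that is skipped or fails below, so the
 * LIO_WAIT loop at the end waits only for the requests that were
 * actually queued.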
2985 */ 2986 head = NULL; 2987 2988 if (mode_arg == LIO_WAIT || sigev) { 2989 mutex_enter(&aiop->aio_mutex); 2990 error = aio_lio_alloc(&head); 2991 mutex_exit(&aiop->aio_mutex); 2992 if (error) 2993 goto done; 2994 deadhead = 1; 2995 head->lio_nent = nent; 2996 head->lio_refcnt = nent; 2997 head->lio_port = -1; 2998 head->lio_portkev = NULL; 2999 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL && 3000 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) { 3001 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3002 if (sqp == NULL) { 3003 error = EAGAIN; 3004 goto done; 3005 } 3006 sqp->sq_func = NULL; 3007 sqp->sq_next = NULL; 3008 sqp->sq_info.si_code = SI_ASYNCIO; 3009 sqp->sq_info.si_pid = curproc->p_pid; 3010 sqp->sq_info.si_ctid = PRCTID(curproc); 3011 sqp->sq_info.si_zoneid = getzoneid(); 3012 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3013 sqp->sq_info.si_signo = sigevk.sigev_signo; 3014 sqp->sq_info.si_value.sival_int = 3015 sigevk.sigev_value.sival_int; 3016 head->lio_sigqp = sqp; 3017 } else { 3018 head->lio_sigqp = NULL; 3019 } 3020 if (pkevtp) { 3021 /* 3022 * Prepare data to send when list of aiocb's 3023 * has completed. 3024 */ 3025 port_init_event(pkevtp, (uintptr_t)sigev, 3026 (void *)(uintptr_t)pnotify.portnfy_user, 3027 NULL, head); 3028 pkevtp->portkev_events = AIOLIO64; 3029 head->lio_portkev = pkevtp; 3030 head->lio_port = pnotify.portnfy_port; 3031 } 3032 } 3033 3034 for (i = 0; i < nent; i++, ucbp++) { 3035 3036 cbp = (aiocb64_32_t *)(uintptr_t)*ucbp; 3037 /* skip entry if it can't be copied. */ 3038 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) { 3039 if (head) { 3040 mutex_enter(&aiop->aio_mutex); 3041 head->lio_nent--; 3042 head->lio_refcnt--; 3043 mutex_exit(&aiop->aio_mutex); 3044 } 3045 continue; 3046 } 3047 3048 /* skip if opcode for aiocb is LIO_NOP */ 3049 mode = aiocb->aio_lio_opcode; 3050 if (mode == LIO_NOP) { 3051 cbp = NULL; 3052 if (head) { 3053 mutex_enter(&aiop->aio_mutex); 3054 head->lio_nent--; 3055 head->lio_refcnt--; 3056 mutex_exit(&aiop->aio_mutex); 3057 } 3058 continue; 3059 } 3060 3061 /* increment file descriptor's ref count. 
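 * getf() returns NULL (and EBADF is reported) if aio_fildes is not an
 * open descriptor in this process; otherwise the hold it takes is
 * dropped again by releasef() on the error paths below, or kept until
 * the I/O completes (clear_active_fd() only clears the thread's
 * active-fd state).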
*/ 3062 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 3063 lio_set_uerror(&cbp->aio_resultp, EBADF); 3064 if (head) { 3065 mutex_enter(&aiop->aio_mutex); 3066 head->lio_nent--; 3067 head->lio_refcnt--; 3068 mutex_exit(&aiop->aio_mutex); 3069 } 3070 aio_errors++; 3071 continue; 3072 } 3073 3074 /* 3075 * check the permission of the partition 3076 */ 3077 if ((fp->f_flag & mode) == 0) { 3078 releasef(aiocb->aio_fildes); 3079 lio_set_uerror(&cbp->aio_resultp, EBADF); 3080 if (head) { 3081 mutex_enter(&aiop->aio_mutex); 3082 head->lio_nent--; 3083 head->lio_refcnt--; 3084 mutex_exit(&aiop->aio_mutex); 3085 } 3086 aio_errors++; 3087 continue; 3088 } 3089 3090 /* 3091 * common case where requests are to the same fd 3092 * for the same r/w operation 3093 * for UFS, need to set EBADFD 3094 */ 3095 vp = fp->f_vnode; 3096 if (fp != prev_fp || mode != prev_mode) { 3097 aio_func = check_vp(vp, mode); 3098 if (aio_func == NULL) { 3099 prev_fp = NULL; 3100 releasef(aiocb->aio_fildes); 3101 lio_set_uerror(&cbp->aio_resultp, EBADFD); 3102 aio_notsupported++; 3103 if (head) { 3104 mutex_enter(&aiop->aio_mutex); 3105 head->lio_nent--; 3106 head->lio_refcnt--; 3107 mutex_exit(&aiop->aio_mutex); 3108 } 3109 continue; 3110 } else { 3111 prev_fp = fp; 3112 prev_mode = mode; 3113 } 3114 } 3115 3116 #ifdef _LP64 3117 aiocb_LFton(aiocb, &aiocb_n); 3118 error = aio_req_setup(&reqp, aiop, &aiocb_n, 3119 (aio_result_t *)&cbp->aio_resultp, vp, 0); 3120 #else 3121 error = aio_req_setupLF(&reqp, aiop, aiocb, 3122 (aio_result_t *)&cbp->aio_resultp, vp, 0); 3123 #endif /* _LP64 */ 3124 if (error) { 3125 releasef(aiocb->aio_fildes); 3126 lio_set_uerror(&cbp->aio_resultp, error); 3127 if (head) { 3128 mutex_enter(&aiop->aio_mutex); 3129 head->lio_nent--; 3130 head->lio_refcnt--; 3131 mutex_exit(&aiop->aio_mutex); 3132 } 3133 aio_errors++; 3134 continue; 3135 } 3136 3137 reqp->aio_req_lio = head; 3138 deadhead = 0; 3139 3140 /* 3141 * Set the errno field now before sending the request to 3142 * the driver to avoid a race condition 3143 */ 3144 (void) suword32(&cbp->aio_resultp.aio_errno, 3145 EINPROGRESS); 3146 3147 reqp->aio_req_iocb.iocb32 = *ucbp; 3148 3149 event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64; 3150 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT); 3151 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD); 3152 if (aio_port | aio_thread) { 3153 port_kevent_t *lpkevp; 3154 /* 3155 * Prepare data to send with each aiocb completed. 3156 */ 3157 if (aio_port) { 3158 void *paddr = (void *)(uintptr_t) 3159 aiocb->aio_sigevent.sigev_value.sival_ptr; 3160 if (copyin(paddr, &pnotify, sizeof (pnotify))) 3161 error = EFAULT; 3162 } else { /* aio_thread */ 3163 pnotify.portnfy_port = 3164 aiocb->aio_sigevent.sigev_signo; 3165 pnotify.portnfy_user = 3166 aiocb->aio_sigevent.sigev_value.sival_ptr; 3167 } 3168 if (error) 3169 /* EMPTY */; 3170 else if (pkevtp != NULL && 3171 pnotify.portnfy_port == lio_head_port) 3172 error = port_dup_event(pkevtp, &lpkevp, 3173 PORT_ALLOC_DEFAULT); 3174 else 3175 error = port_alloc_event(pnotify.portnfy_port, 3176 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, 3177 &lpkevp); 3178 if (error == 0) { 3179 port_init_event(lpkevp, (uintptr_t)*ucbp, 3180 (void *)(uintptr_t)pnotify.portnfy_user, 3181 aio_port_callback, reqp); 3182 lpkevp->portkev_events = event; 3183 reqp->aio_req_portkev = lpkevp; 3184 reqp->aio_req_port = pnotify.portnfy_port; 3185 } 3186 } 3187 3188 /* 3189 * send the request to driver. 
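 * aio_func was chosen by check_vp() above: either driver_aio_read/
 * driver_aio_write, which call the device's cb_aread/cb_awrite entry
 * point, or the clpxfs_aio_read/clpxfs_aio_write wrappers for PXFS
 * vnodes.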
3190 */ 3191 if (error == 0) { 3192 if (aiocb->aio_nbytes == 0) { 3193 clear_active_fd(aiocb->aio_fildes); 3194 aio_zerolen(reqp); 3195 continue; 3196 } 3197 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 3198 CRED()); 3199 } 3200 3201 /* 3202 * the fd's ref count is not decremented until the IO has 3203 * completed unless there was an error. 3204 */ 3205 if (error) { 3206 releasef(aiocb->aio_fildes); 3207 lio_set_uerror(&cbp->aio_resultp, error); 3208 if (head) { 3209 mutex_enter(&aiop->aio_mutex); 3210 head->lio_nent--; 3211 head->lio_refcnt--; 3212 mutex_exit(&aiop->aio_mutex); 3213 } 3214 if (error == ENOTSUP) 3215 aio_notsupported++; 3216 else 3217 aio_errors++; 3218 lio_set_error(reqp, portused); 3219 } else { 3220 clear_active_fd(aiocb->aio_fildes); 3221 } 3222 } 3223 3224 if (aio_notsupported) { 3225 error = ENOTSUP; 3226 } else if (aio_errors) { 3227 /* 3228 * return EIO if any request failed 3229 */ 3230 error = EIO; 3231 } 3232 3233 if (mode_arg == LIO_WAIT) { 3234 mutex_enter(&aiop->aio_mutex); 3235 while (head->lio_refcnt > 0) { 3236 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 3237 mutex_exit(&aiop->aio_mutex); 3238 error = EINTR; 3239 goto done; 3240 } 3241 } 3242 mutex_exit(&aiop->aio_mutex); 3243 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE); 3244 } 3245 3246 done: 3247 kmem_free(cbplist, ssize); 3248 if (deadhead) { 3249 if (head->lio_sigqp) 3250 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 3251 if (head->lio_portkev) 3252 port_free_event(head->lio_portkev); 3253 kmem_free(head, sizeof (aio_lio_t)); 3254 } 3255 return (error); 3256 } 3257 3258 #ifdef _SYSCALL32_IMPL 3259 static void 3260 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest) 3261 { 3262 dest->aio_fildes = src->aio_fildes; 3263 dest->aio_buf = (void *)(uintptr_t)src->aio_buf; 3264 dest->aio_nbytes = (size_t)src->aio_nbytes; 3265 dest->aio_offset = (off_t)src->aio_offset; 3266 dest->aio_reqprio = src->aio_reqprio; 3267 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify; 3268 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo; 3269 3270 /* 3271 * See comment in sigqueue32() on handling of 32-bit 3272 * sigvals in a 64-bit kernel. 3273 */ 3274 dest->aio_sigevent.sigev_value.sival_int = 3275 (int)src->aio_sigevent.sigev_value.sival_int; 3276 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval)) 3277 (uintptr_t)src->aio_sigevent.sigev_notify_function; 3278 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *) 3279 (uintptr_t)src->aio_sigevent.sigev_notify_attributes; 3280 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2; 3281 dest->aio_lio_opcode = src->aio_lio_opcode; 3282 dest->aio_state = src->aio_state; 3283 dest->aio__pad[0] = src->aio__pad[0]; 3284 } 3285 #endif 3286 3287 /* 3288 * This function is used only for largefile calls made by 3289 * 32 bit applications. 
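 * It mirrors aio_req_setup(), but takes an aiocb64_32_t (with its
 * sigevent32 and 64-bit aio_offset), so the largefile control block is
 * used directly without first being converted to a native aiocb_t.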
3290 */ 3291 static int 3292 aio_req_setupLF( 3293 aio_req_t **reqpp, 3294 aio_t *aiop, 3295 aiocb64_32_t *arg, 3296 aio_result_t *resultp, 3297 vnode_t *vp, 3298 int old_solaris_req) 3299 { 3300 sigqueue_t *sqp = NULL; 3301 aio_req_t *reqp; 3302 struct uio *uio; 3303 struct sigevent32 *sigev; 3304 int error; 3305 3306 sigev = &arg->aio_sigevent; 3307 if (sigev->sigev_notify == SIGEV_SIGNAL && 3308 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) { 3309 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3310 if (sqp == NULL) 3311 return (EAGAIN); 3312 sqp->sq_func = NULL; 3313 sqp->sq_next = NULL; 3314 sqp->sq_info.si_code = SI_ASYNCIO; 3315 sqp->sq_info.si_pid = curproc->p_pid; 3316 sqp->sq_info.si_ctid = PRCTID(curproc); 3317 sqp->sq_info.si_zoneid = getzoneid(); 3318 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3319 sqp->sq_info.si_signo = sigev->sigev_signo; 3320 sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int; 3321 } 3322 3323 mutex_enter(&aiop->aio_mutex); 3324 3325 if (aiop->aio_flags & AIO_REQ_BLOCK) { 3326 mutex_exit(&aiop->aio_mutex); 3327 if (sqp) 3328 kmem_free(sqp, sizeof (sigqueue_t)); 3329 return (EIO); 3330 } 3331 /* 3332 * get an aio_reqp from the free list or allocate one 3333 * from dynamic memory. 3334 */ 3335 if (error = aio_req_alloc(&reqp, resultp)) { 3336 mutex_exit(&aiop->aio_mutex); 3337 if (sqp) 3338 kmem_free(sqp, sizeof (sigqueue_t)); 3339 return (error); 3340 } 3341 aiop->aio_pending++; 3342 aiop->aio_outstanding++; 3343 reqp->aio_req_flags = AIO_PENDING; 3344 if (old_solaris_req) { 3345 /* this is an old solaris aio request */ 3346 reqp->aio_req_flags |= AIO_SOLARIS; 3347 aiop->aio_flags |= AIO_SOLARIS_REQ; 3348 } 3349 if (sigev->sigev_notify == SIGEV_THREAD || 3350 sigev->sigev_notify == SIGEV_PORT) 3351 aio_enq(&aiop->aio_portpending, reqp, 0); 3352 mutex_exit(&aiop->aio_mutex); 3353 /* 3354 * initialize aio request. 3355 */ 3356 reqp->aio_req_fd = arg->aio_fildes; 3357 reqp->aio_req_sigqp = sqp; 3358 reqp->aio_req_iocb.iocb = NULL; 3359 reqp->aio_req_lio = NULL; 3360 reqp->aio_req_buf.b_file = vp; 3361 uio = reqp->aio_req.aio_uio; 3362 uio->uio_iovcnt = 1; 3363 uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf; 3364 uio->uio_iov->iov_len = arg->aio_nbytes; 3365 uio->uio_loffset = arg->aio_offset; 3366 *reqpp = reqp; 3367 return (0); 3368 } 3369 3370 /* 3371 * This routine is called when a non largefile call is made by a 32bit 3372 * process on a ILP32 or LP64 kernel. 
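 * On an LP64 kernel each aiocb32_t is converted to a native aiocb_t
 * with aiocb_32ton() before the request is set up; on an ILP32 kernel
 * the caller's aiocb_t layout already matches and is used as is.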
3373 */ 3374 static int 3375 alio32( 3376 int mode_arg, 3377 void *aiocb_arg, 3378 int nent, 3379 void *sigev) 3380 { 3381 file_t *fp; 3382 file_t *prev_fp = NULL; 3383 int prev_mode = -1; 3384 struct vnode *vp; 3385 aio_lio_t *head; 3386 aio_req_t *reqp; 3387 aio_t *aiop; 3388 caddr_t cbplist; 3389 aiocb_t cb; 3390 aiocb_t *aiocb = &cb; 3391 #ifdef _LP64 3392 aiocb32_t *cbp; 3393 caddr32_t *ucbp; 3394 aiocb32_t cb32; 3395 aiocb32_t *aiocb32 = &cb32; 3396 struct sigevent32 sigevk; 3397 #else 3398 aiocb_t *cbp, **ucbp; 3399 struct sigevent sigevk; 3400 #endif 3401 sigqueue_t *sqp; 3402 int (*aio_func)(); 3403 int mode; 3404 int error = 0; 3405 int aio_errors = 0; 3406 int i; 3407 size_t ssize; 3408 int deadhead = 0; 3409 int aio_notsupported = 0; 3410 int lio_head_port; 3411 int aio_port; 3412 int aio_thread; 3413 port_kevent_t *pkevtp = NULL; 3414 int portused = 0; 3415 #ifdef _LP64 3416 port_notify32_t pnotify; 3417 #else 3418 port_notify_t pnotify; 3419 #endif 3420 int event; 3421 3422 aiop = curproc->p_aio; 3423 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 3424 return (EINVAL); 3425 3426 #ifdef _LP64 3427 ssize = (sizeof (caddr32_t) * nent); 3428 #else 3429 ssize = (sizeof (aiocb_t *) * nent); 3430 #endif 3431 cbplist = kmem_alloc(ssize, KM_SLEEP); 3432 ucbp = (void *)cbplist; 3433 3434 if (copyin(aiocb_arg, cbplist, ssize) || 3435 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) { 3436 kmem_free(cbplist, ssize); 3437 return (EFAULT); 3438 } 3439 3440 /* Event Ports */ 3441 if (sigev && 3442 (sigevk.sigev_notify == SIGEV_THREAD || 3443 sigevk.sigev_notify == SIGEV_PORT)) { 3444 if (sigevk.sigev_notify == SIGEV_THREAD) { 3445 pnotify.portnfy_port = sigevk.sigev_signo; 3446 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr; 3447 } else if (copyin( 3448 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr, 3449 &pnotify, sizeof (pnotify))) { 3450 kmem_free(cbplist, ssize); 3451 return (EFAULT); 3452 } 3453 error = port_alloc_event(pnotify.portnfy_port, 3454 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp); 3455 if (error) { 3456 if (error == ENOMEM || error == EAGAIN) 3457 error = EAGAIN; 3458 else 3459 error = EINVAL; 3460 kmem_free(cbplist, ssize); 3461 return (error); 3462 } 3463 lio_head_port = pnotify.portnfy_port; 3464 portused = 1; 3465 } 3466 3467 /* 3468 * a list head should be allocated if notification is 3469 * enabled for this list. 
3470 */ 3471 head = NULL; 3472 3473 if (mode_arg == LIO_WAIT || sigev) { 3474 mutex_enter(&aiop->aio_mutex); 3475 error = aio_lio_alloc(&head); 3476 mutex_exit(&aiop->aio_mutex); 3477 if (error) 3478 goto done; 3479 deadhead = 1; 3480 head->lio_nent = nent; 3481 head->lio_refcnt = nent; 3482 head->lio_port = -1; 3483 head->lio_portkev = NULL; 3484 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL && 3485 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) { 3486 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3487 if (sqp == NULL) { 3488 error = EAGAIN; 3489 goto done; 3490 } 3491 sqp->sq_func = NULL; 3492 sqp->sq_next = NULL; 3493 sqp->sq_info.si_code = SI_ASYNCIO; 3494 sqp->sq_info.si_pid = curproc->p_pid; 3495 sqp->sq_info.si_ctid = PRCTID(curproc); 3496 sqp->sq_info.si_zoneid = getzoneid(); 3497 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3498 sqp->sq_info.si_signo = sigevk.sigev_signo; 3499 sqp->sq_info.si_value.sival_int = 3500 sigevk.sigev_value.sival_int; 3501 head->lio_sigqp = sqp; 3502 } else { 3503 head->lio_sigqp = NULL; 3504 } 3505 if (pkevtp) { 3506 /* 3507 * Prepare data to send when list of aiocb's has 3508 * completed. 3509 */ 3510 port_init_event(pkevtp, (uintptr_t)sigev, 3511 (void *)(uintptr_t)pnotify.portnfy_user, 3512 NULL, head); 3513 pkevtp->portkev_events = AIOLIO; 3514 head->lio_portkev = pkevtp; 3515 head->lio_port = pnotify.portnfy_port; 3516 } 3517 } 3518 3519 for (i = 0; i < nent; i++, ucbp++) { 3520 3521 /* skip entry if it can't be copied. */ 3522 #ifdef _LP64 3523 cbp = (aiocb32_t *)(uintptr_t)*ucbp; 3524 if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32))) 3525 #else 3526 cbp = (aiocb_t *)*ucbp; 3527 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) 3528 #endif 3529 { 3530 if (head) { 3531 mutex_enter(&aiop->aio_mutex); 3532 head->lio_nent--; 3533 head->lio_refcnt--; 3534 mutex_exit(&aiop->aio_mutex); 3535 } 3536 continue; 3537 } 3538 #ifdef _LP64 3539 /* 3540 * copy 32 bit structure into 64 bit structure 3541 */ 3542 aiocb_32ton(aiocb32, aiocb); 3543 #endif /* _LP64 */ 3544 3545 /* skip if opcode for aiocb is LIO_NOP */ 3546 mode = aiocb->aio_lio_opcode; 3547 if (mode == LIO_NOP) { 3548 cbp = NULL; 3549 if (head) { 3550 mutex_enter(&aiop->aio_mutex); 3551 head->lio_nent--; 3552 head->lio_refcnt--; 3553 mutex_exit(&aiop->aio_mutex); 3554 } 3555 continue; 3556 } 3557 3558 /* increment file descriptor's ref count. 
*/ 3559 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 3560 lio_set_uerror(&cbp->aio_resultp, EBADF); 3561 if (head) { 3562 mutex_enter(&aiop->aio_mutex); 3563 head->lio_nent--; 3564 head->lio_refcnt--; 3565 mutex_exit(&aiop->aio_mutex); 3566 } 3567 aio_errors++; 3568 continue; 3569 } 3570 3571 /* 3572 * check the permission of the partition 3573 */ 3574 if ((fp->f_flag & mode) == 0) { 3575 releasef(aiocb->aio_fildes); 3576 lio_set_uerror(&cbp->aio_resultp, EBADF); 3577 if (head) { 3578 mutex_enter(&aiop->aio_mutex); 3579 head->lio_nent--; 3580 head->lio_refcnt--; 3581 mutex_exit(&aiop->aio_mutex); 3582 } 3583 aio_errors++; 3584 continue; 3585 } 3586 3587 /* 3588 * common case where requests are to the same fd 3589 * for the same r/w operation 3590 * for UFS, need to set EBADFD 3591 */ 3592 vp = fp->f_vnode; 3593 if (fp != prev_fp || mode != prev_mode) { 3594 aio_func = check_vp(vp, mode); 3595 if (aio_func == NULL) { 3596 prev_fp = NULL; 3597 releasef(aiocb->aio_fildes); 3598 lio_set_uerror(&cbp->aio_resultp, EBADFD); 3599 aio_notsupported++; 3600 if (head) { 3601 mutex_enter(&aiop->aio_mutex); 3602 head->lio_nent--; 3603 head->lio_refcnt--; 3604 mutex_exit(&aiop->aio_mutex); 3605 } 3606 continue; 3607 } else { 3608 prev_fp = fp; 3609 prev_mode = mode; 3610 } 3611 } 3612 3613 error = aio_req_setup(&reqp, aiop, aiocb, 3614 (aio_result_t *)&cbp->aio_resultp, vp, 0); 3615 if (error) { 3616 releasef(aiocb->aio_fildes); 3617 lio_set_uerror(&cbp->aio_resultp, error); 3618 if (head) { 3619 mutex_enter(&aiop->aio_mutex); 3620 head->lio_nent--; 3621 head->lio_refcnt--; 3622 mutex_exit(&aiop->aio_mutex); 3623 } 3624 aio_errors++; 3625 continue; 3626 } 3627 3628 reqp->aio_req_lio = head; 3629 deadhead = 0; 3630 3631 /* 3632 * Set the errno field now before sending the request to 3633 * the driver to avoid a race condition 3634 */ 3635 (void) suword32(&cbp->aio_resultp.aio_errno, 3636 EINPROGRESS); 3637 3638 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp; 3639 3640 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE; 3641 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT); 3642 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD); 3643 if (aio_port | aio_thread) { 3644 port_kevent_t *lpkevp; 3645 /* 3646 * Prepare data to send with each aiocb completed. 
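 * If this aiocb names the same event port as the list header, the
 * header's port_kevent_t is duplicated with port_dup_event(); otherwise
 * a new event is allocated with port_alloc_event() against the port
 * given in the aiocb's own notification.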
3647 */ 3648 #ifdef _LP64 3649 if (aio_port) { 3650 void *paddr = (void *)(uintptr_t) 3651 aiocb32->aio_sigevent.sigev_value.sival_ptr; 3652 if (copyin(paddr, &pnotify, sizeof (pnotify))) 3653 error = EFAULT; 3654 } else { /* aio_thread */ 3655 pnotify.portnfy_port = 3656 aiocb32->aio_sigevent.sigev_signo; 3657 pnotify.portnfy_user = 3658 aiocb32->aio_sigevent.sigev_value.sival_ptr; 3659 } 3660 #else 3661 if (aio_port) { 3662 void *paddr = 3663 aiocb->aio_sigevent.sigev_value.sival_ptr; 3664 if (copyin(paddr, &pnotify, sizeof (pnotify))) 3665 error = EFAULT; 3666 } else { /* aio_thread */ 3667 pnotify.portnfy_port = 3668 aiocb->aio_sigevent.sigev_signo; 3669 pnotify.portnfy_user = 3670 aiocb->aio_sigevent.sigev_value.sival_ptr; 3671 } 3672 #endif 3673 if (error) 3674 /* EMPTY */; 3675 else if (pkevtp != NULL && 3676 pnotify.portnfy_port == lio_head_port) 3677 error = port_dup_event(pkevtp, &lpkevp, 3678 PORT_ALLOC_DEFAULT); 3679 else 3680 error = port_alloc_event(pnotify.portnfy_port, 3681 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, 3682 &lpkevp); 3683 if (error == 0) { 3684 port_init_event(lpkevp, (uintptr_t)cbp, 3685 (void *)(uintptr_t)pnotify.portnfy_user, 3686 aio_port_callback, reqp); 3687 lpkevp->portkev_events = event; 3688 reqp->aio_req_portkev = lpkevp; 3689 reqp->aio_req_port = pnotify.portnfy_port; 3690 } 3691 } 3692 3693 /* 3694 * send the request to driver. 3695 */ 3696 if (error == 0) { 3697 if (aiocb->aio_nbytes == 0) { 3698 clear_active_fd(aiocb->aio_fildes); 3699 aio_zerolen(reqp); 3700 continue; 3701 } 3702 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 3703 CRED()); 3704 } 3705 3706 /* 3707 * the fd's ref count is not decremented until the IO has 3708 * completed unless there was an error. 3709 */ 3710 if (error) { 3711 releasef(aiocb->aio_fildes); 3712 lio_set_uerror(&cbp->aio_resultp, error); 3713 if (head) { 3714 mutex_enter(&aiop->aio_mutex); 3715 head->lio_nent--; 3716 head->lio_refcnt--; 3717 mutex_exit(&aiop->aio_mutex); 3718 } 3719 if (error == ENOTSUP) 3720 aio_notsupported++; 3721 else 3722 aio_errors++; 3723 lio_set_error(reqp, portused); 3724 } else { 3725 clear_active_fd(aiocb->aio_fildes); 3726 } 3727 } 3728 3729 if (aio_notsupported) { 3730 error = ENOTSUP; 3731 } else if (aio_errors) { 3732 /* 3733 * return EIO if any request failed 3734 */ 3735 error = EIO; 3736 } 3737 3738 if (mode_arg == LIO_WAIT) { 3739 mutex_enter(&aiop->aio_mutex); 3740 while (head->lio_refcnt > 0) { 3741 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 3742 mutex_exit(&aiop->aio_mutex); 3743 error = EINTR; 3744 goto done; 3745 } 3746 } 3747 mutex_exit(&aiop->aio_mutex); 3748 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32); 3749 } 3750 3751 done: 3752 kmem_free(cbplist, ssize); 3753 if (deadhead) { 3754 if (head->lio_sigqp) 3755 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 3756 if (head->lio_portkev) 3757 port_free_event(head->lio_portkev); 3758 kmem_free(head, sizeof (aio_lio_t)); 3759 } 3760 return (error); 3761 } 3762 3763 3764 #ifdef _SYSCALL32_IMPL 3765 void 3766 aiocb_32ton(aiocb32_t *src, aiocb_t *dest) 3767 { 3768 dest->aio_fildes = src->aio_fildes; 3769 dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf; 3770 dest->aio_nbytes = (size_t)src->aio_nbytes; 3771 dest->aio_offset = (off_t)src->aio_offset; 3772 dest->aio_reqprio = src->aio_reqprio; 3773 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify; 3774 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo; 3775 3776 /* 3777 * See comment in sigqueue32() on handling of 32-bit 3778 * 
sigvals in a 64-bit kernel.
 */
	dest->aio_sigevent.sigev_value.sival_int =
	    (int)src->aio_sigevent.sigev_value.sival_int;
	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
	dest->aio_lio_opcode = src->aio_lio_opcode;
	dest->aio_state = src->aio_state;
	dest->aio__pad[0] = src->aio__pad[0];
}
#endif /* _SYSCALL32_IMPL */

/*
 * aio_port_callback() is called just before the event is retrieved from the
 * port. The task of this callback function is to finish the work of the
 * transaction for the application, which means:
 * - copyout transaction data to the application
 *   (this thread is running in the right process context)
 * - keep track of the transaction (update of counters)
 * - free allocated buffers
 * The aiocb pointer is the object element of the port_kevent_t structure.
 *
 * flag:
 * PORT_CALLBACK_DEFAULT : do copyout and free resources
 * PORT_CALLBACK_CLOSE   : don't do copyout, free resources
 */

/*ARGSUSED*/
int
aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
{
	aio_t *aiop = curproc->p_aio;
	aio_req_t *reqp = arg;
	struct iovec *iov;
	struct buf *bp;
	void *resultp;

	if (pid != curproc->p_pid) {
		/* wrong process; cannot deliver data here */
		return (EACCES);
	}

	mutex_enter(&aiop->aio_portq_mutex);
	reqp->aio_req_portkev = NULL;
	aio_req_remove_portq(aiop, reqp);	/* remove request from portq */
	mutex_exit(&aiop->aio_portq_mutex);
	aphysio_unlock(reqp);			/* unlock used pages */
	mutex_enter(&aiop->aio_mutex);
	if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
		aio_req_free_port(aiop, reqp);	/* back to free list */
		mutex_exit(&aiop->aio_mutex);
		return (0);
	}

	iov = reqp->aio_req_uio.uio_iov;
	bp = &reqp->aio_req_buf;
	resultp = (void *)reqp->aio_req_resultp;
	if (flag == PORT_CALLBACK_DEFAULT)
		aio_copyout_result_port(iov, bp, resultp);
	aio_req_free_port(aiop, reqp);	/* request struct back to free list */
	mutex_exit(&aiop->aio_mutex);
	return (0);
}
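
/*
 * For reference, a user program arranges for aio_port_callback() to run
 * by asking for event-port notification, roughly as follows (an
 * illustrative sketch only; fd, buf and my_cookie are placeholders and
 * all error handling is omitted):
 *
 *	int port = port_create();
 *	port_notify_t pn = { port, my_cookie };
 *	struct aiocb cb = { 0 };
 *
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof (buf);
 *	cb.aio_sigevent.sigev_notify = SIGEV_PORT;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &pn;
 *	(void) aio_read(&cb);
 *
 *	port_event_t pe;
 *	(void) port_get(port, &pe, NULL);
 *
 * The port_get() call returns once the I/O has completed, with
 * pe.portev_user set to my_cookie and pe.portev_object pointing at the
 * aiocb.
 */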