1 /* 2 * Copyright (c) 1997 John S. Dyson. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. John S. Dyson's name may not be used to endorse or promote products 10 * derived from this software without specific prior written permission. 11 * 12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything 13 * bad that happens because of using this software isn't the responsibility 14 * of the author. This software is distributed AS-IS. 15 * 16 * $FreeBSD$ 17 */ 18 19 /* 20 * This file contains support for the POSIX 1003.1B AIO/LIO facility. 21 */ 22 23 #include <sys/param.h> 24 #include <sys/systm.h> 25 #include <sys/bio.h> 26 #include <sys/buf.h> 27 #include <sys/sysproto.h> 28 #include <sys/filedesc.h> 29 #include <sys/kernel.h> 30 #include <sys/kthread.h> 31 #include <sys/fcntl.h> 32 #include <sys/file.h> 33 #include <sys/lock.h> 34 #include <sys/mutex.h> 35 #include <sys/unistd.h> 36 #include <sys/proc.h> 37 #include <sys/resourcevar.h> 38 #include <sys/signalvar.h> 39 #include <sys/protosw.h> 40 #include <sys/socketvar.h> 41 #include <sys/syscall.h> 42 #include <sys/sysent.h> 43 #include <sys/sysctl.h> 44 #include <sys/vnode.h> 45 #include <sys/conf.h> 46 #include <sys/event.h> 47 48 #include <vm/vm.h> 49 #include <vm/vm_extern.h> 50 #include <vm/pmap.h> 51 #include <vm/vm_map.h> 52 #include <vm/vm_zone.h> 53 #include <sys/aio.h> 54 55 #include <machine/limits.h> 56 57 #include "opt_vfs_aio.h" 58 59 static long jobrefid; 60 61 #define JOBST_NULL 0x0 62 #define JOBST_JOBQGLOBAL 0x2 63 #define JOBST_JOBRUNNING 0x3 64 #define JOBST_JOBFINISHED 0x4 65 #define JOBST_JOBQBUF 0x5 66 #define JOBST_JOBBFINISHED 0x6 67 68 #ifndef MAX_AIO_PER_PROC 69 #define MAX_AIO_PER_PROC 32 70 #endif 71 72 #ifndef MAX_AIO_QUEUE_PER_PROC 73 #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ 74 #endif 75 76 #ifndef MAX_AIO_PROCS 77 #define MAX_AIO_PROCS 32 78 #endif 79 80 #ifndef MAX_AIO_QUEUE 81 #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ 82 #endif 83 84 #ifndef TARGET_AIO_PROCS 85 #define TARGET_AIO_PROCS 4 86 #endif 87 88 #ifndef MAX_BUF_AIO 89 #define MAX_BUF_AIO 16 90 #endif 91 92 #ifndef AIOD_TIMEOUT_DEFAULT 93 #define AIOD_TIMEOUT_DEFAULT (10 * hz) 94 #endif 95 96 #ifndef AIOD_LIFETIME_DEFAULT 97 #define AIOD_LIFETIME_DEFAULT (30 * hz) 98 #endif 99 100 static int max_aio_procs = MAX_AIO_PROCS; 101 static int num_aio_procs = 0; 102 static int target_aio_procs = TARGET_AIO_PROCS; 103 static int max_queue_count = MAX_AIO_QUEUE; 104 static int num_queue_count = 0; 105 static int num_buf_aio = 0; 106 static int num_aio_resv_start = 0; 107 static int aiod_timeout; 108 static int aiod_lifetime; 109 static int unloadable = 0; 110 111 static int max_aio_per_proc = MAX_AIO_PER_PROC; 112 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; 113 static int max_buf_aio = MAX_BUF_AIO; 114 115 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt"); 116 117 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, 118 CTLFLAG_RW, &max_aio_per_proc, 0, ""); 119 120 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, 121 CTLFLAG_RW, &max_aio_queue_per_proc, 0, ""); 122 123 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, 124 CTLFLAG_RW, &max_aio_procs, 0, ""); 125 126 SYSCTL_INT(_vfs_aio, OID_AUTO, 
num_aio_procs, 127 CTLFLAG_RD, &num_aio_procs, 0, ""); 128 129 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, 130 CTLFLAG_RD, &num_queue_count, 0, ""); 131 132 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, 133 CTLFLAG_RW, &max_queue_count, 0, ""); 134 135 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, 136 CTLFLAG_RW, &target_aio_procs, 0, ""); 137 138 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, 139 CTLFLAG_RW, &max_buf_aio, 0, ""); 140 141 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, 142 CTLFLAG_RD, &num_buf_aio, 0, ""); 143 144 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, 145 CTLFLAG_RW, &aiod_lifetime, 0, ""); 146 147 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, 148 CTLFLAG_RW, &aiod_timeout, 0, ""); 149 150 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, 151 "Allow unload of aio (not recommended)"); 152 153 struct aiocblist { 154 TAILQ_ENTRY(aiocblist) list; /* List of jobs */ 155 TAILQ_ENTRY(aiocblist) plist; /* List of jobs for proc */ 156 int jobflags; 157 int jobstate; 158 int inputcharge; 159 int outputcharge; 160 struct callout_handle timeouthandle; 161 struct buf *bp; /* Buffer pointer */ 162 struct proc *userproc; /* User process */ /* Not td! */ 163 struct file *fd_file; /* Pointer to file structure */ 164 struct aiothreadlist *jobaiothread; /* AIO process descriptor */ 165 struct aio_liojob *lio; /* Optional lio job */ 166 struct aiocb *uuaiocb; /* Pointer in userspace of aiocb */ 167 struct klist klist; /* list of knotes */ 168 struct aiocb uaiocb; /* Kernel I/O control block */ 169 }; 170 171 /* jobflags */ 172 #define AIOCBLIST_RUNDOWN 0x4 173 #define AIOCBLIST_ASYNCFREE 0x8 174 #define AIOCBLIST_DONE 0x10 175 176 /* 177 * AIO process info 178 */ 179 #define AIOP_FREE 0x1 /* proc on free queue */ 180 #define AIOP_SCHED 0x2 /* proc explicitly scheduled */ 181 182 struct aiothreadlist { 183 int aiothreadflags; /* AIO proc flags */ 184 TAILQ_ENTRY(aiothreadlist) list; /* List of processes */ 185 struct thread *aiothread; /* The AIO thread */ 186 }; 187 188 /* 189 * data-structure for lio signal management 190 */ 191 struct aio_liojob { 192 int lioj_flags; 193 int lioj_buffer_count; 194 int lioj_buffer_finished_count; 195 int lioj_queue_count; 196 int lioj_queue_finished_count; 197 struct sigevent lioj_signal; /* signal on all I/O done */ 198 TAILQ_ENTRY(aio_liojob) lioj_list; 199 struct kaioinfo *lioj_ki; 200 }; 201 #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ 202 #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ 203 204 /* 205 * per process aio data structure 206 */ 207 struct kaioinfo { 208 int kaio_flags; /* per process kaio flags */ 209 int kaio_maxactive_count; /* maximum number of AIOs */ 210 int kaio_active_count; /* number of currently used AIOs */ 211 int kaio_qallowed_count; /* maxiumu size of AIO queue */ 212 int kaio_queue_count; /* size of AIO queue */ 213 int kaio_ballowed_count; /* maximum number of buffers */ 214 int kaio_queue_finished_count; /* number of daemon jobs finished */ 215 int kaio_buffer_count; /* number of physio buffers */ 216 int kaio_buffer_finished_count; /* count of I/O done */ 217 struct proc *kaio_p; /* process that uses this kaio block */ 218 TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */ 219 TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */ 220 TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */ 221 TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */ 222 TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */ 
223 TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */ 224 }; 225 226 #define KAIO_RUNDOWN 0x1 /* process is being run down */ 227 #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ 228 229 static TAILQ_HEAD(,aiothreadlist) aio_freeproc, aio_activeproc; 230 static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ 231 static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ 232 233 static void aio_init_aioinfo(struct proc *p); 234 static void aio_onceonly(void); 235 static int aio_free_entry(struct aiocblist *aiocbe); 236 static void aio_process(struct aiocblist *aiocbe); 237 static int aio_newproc(void); 238 static int aio_aqueue(struct thread *td, struct aiocb *job, int type); 239 static void aio_physwakeup(struct buf *bp); 240 static void aio_proc_rundown(struct proc *p); 241 static int aio_fphysio(struct aiocblist *aiocbe); 242 static int aio_qphysio(struct proc *p, struct aiocblist *iocb); 243 static void aio_daemon(void *uproc); 244 static void aio_swake_cb(struct socket *, struct sockbuf *); 245 static int aio_unload(void); 246 static void process_signal(void *aioj); 247 static int filt_aioattach(struct knote *kn); 248 static void filt_aiodetach(struct knote *kn); 249 static int filt_aio(struct knote *kn, long hint); 250 251 static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone; 252 static vm_zone_t aiolio_zone; 253 254 static struct filterops aio_filtops = 255 { 0, filt_aioattach, filt_aiodetach, filt_aio }; 256 257 static int 258 aio_modload(struct module *module, int cmd, void *arg) 259 { 260 int error = 0; 261 262 switch (cmd) { 263 case MOD_LOAD: 264 aio_onceonly(); 265 break; 266 case MOD_UNLOAD: 267 error = aio_unload(); 268 break; 269 case MOD_SHUTDOWN: 270 break; 271 default: 272 error = EINVAL; 273 break; 274 } 275 return (error); 276 } 277 278 static moduledata_t aio_mod = { 279 "aio", 280 &aio_modload, 281 NULL 282 }; 283 284 SYSCALL_MODULE_HELPER(aio_return); 285 SYSCALL_MODULE_HELPER(aio_suspend); 286 SYSCALL_MODULE_HELPER(aio_cancel); 287 SYSCALL_MODULE_HELPER(aio_error); 288 SYSCALL_MODULE_HELPER(aio_read); 289 SYSCALL_MODULE_HELPER(aio_write); 290 SYSCALL_MODULE_HELPER(aio_waitcomplete); 291 SYSCALL_MODULE_HELPER(lio_listio); 292 293 DECLARE_MODULE(aio, aio_mod, 294 SI_SUB_VFS, SI_ORDER_ANY); 295 MODULE_VERSION(aio, 1); 296 297 /* 298 * Startup initialization 299 */ 300 static void 301 aio_onceonly(void) 302 { 303 304 /* XXX: should probably just use so->callback */ 305 aio_swake = &aio_swake_cb; 306 at_exit(aio_proc_rundown); 307 at_exec(aio_proc_rundown); 308 kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); 309 TAILQ_INIT(&aio_freeproc); 310 TAILQ_INIT(&aio_activeproc); 311 TAILQ_INIT(&aio_jobs); 312 TAILQ_INIT(&aio_bufjobs); 313 kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1); 314 aiop_zone = zinit("AIOP", sizeof(struct aiothreadlist), 0, 0, 1); 315 aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1); 316 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1); 317 aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1); 318 aiod_timeout = AIOD_TIMEOUT_DEFAULT; 319 aiod_lifetime = AIOD_LIFETIME_DEFAULT; 320 jobrefid = 1; 321 } 322 323 static int 324 aio_unload(void) 325 { 326 327 /* 328 * XXX: no unloads by default, it's too dangerous. 329 * perhaps we could do it if locked out callers and then 330 * did an aio_proc_rundown() on each process. 
331 */ 332 if (!unloadable) 333 return (EOPNOTSUPP); 334 335 aio_swake = NULL; 336 rm_at_exit(aio_proc_rundown); 337 rm_at_exec(aio_proc_rundown); 338 kqueue_del_filteropts(EVFILT_AIO); 339 return (0); 340 } 341 342 /* 343 * Init the per-process aioinfo structure. The aioinfo limits are set 344 * per-process for user limit (resource) management. 345 */ 346 static void 347 aio_init_aioinfo(struct proc *p) 348 { 349 struct kaioinfo *ki; 350 if (p->p_aioinfo == NULL) { 351 ki = zalloc(kaio_zone); 352 p->p_aioinfo = ki; 353 ki->kaio_flags = 0; 354 ki->kaio_maxactive_count = max_aio_per_proc; 355 ki->kaio_active_count = 0; 356 ki->kaio_qallowed_count = max_aio_queue_per_proc; 357 ki->kaio_queue_count = 0; 358 ki->kaio_ballowed_count = max_buf_aio; 359 ki->kaio_buffer_count = 0; 360 ki->kaio_buffer_finished_count = 0; 361 ki->kaio_p = p; 362 TAILQ_INIT(&ki->kaio_jobdone); 363 TAILQ_INIT(&ki->kaio_jobqueue); 364 TAILQ_INIT(&ki->kaio_bufdone); 365 TAILQ_INIT(&ki->kaio_bufqueue); 366 TAILQ_INIT(&ki->kaio_liojoblist); 367 TAILQ_INIT(&ki->kaio_sockqueue); 368 } 369 370 while (num_aio_procs < target_aio_procs) 371 aio_newproc(); 372 } 373 374 /* 375 * Free a job entry. Wait for completion if it is currently active, but don't 376 * delay forever. If we delay, we return a flag that says that we have to 377 * restart the queue scan. 378 */ 379 static int 380 aio_free_entry(struct aiocblist *aiocbe) 381 { 382 struct kaioinfo *ki; 383 struct aio_liojob *lj; 384 struct proc *p; 385 int error; 386 int s; 387 388 if (aiocbe->jobstate == JOBST_NULL) 389 panic("aio_free_entry: freeing already free job"); 390 391 p = aiocbe->userproc; 392 ki = p->p_aioinfo; 393 lj = aiocbe->lio; 394 if (ki == NULL) 395 panic("aio_free_entry: missing p->p_aioinfo"); 396 397 while (aiocbe->jobstate == JOBST_JOBRUNNING) { 398 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) 399 return 0; 400 aiocbe->jobflags |= AIOCBLIST_RUNDOWN; 401 tsleep(aiocbe, PRIBIO, "jobwai", 0); 402 } 403 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; 404 405 if (aiocbe->bp == NULL) { 406 if (ki->kaio_queue_count <= 0) 407 panic("aio_free_entry: process queue size <= 0"); 408 if (num_queue_count <= 0) 409 panic("aio_free_entry: system wide queue size <= 0"); 410 411 if (lj) { 412 lj->lioj_queue_count--; 413 if (aiocbe->jobflags & AIOCBLIST_DONE) 414 lj->lioj_queue_finished_count--; 415 } 416 ki->kaio_queue_count--; 417 if (aiocbe->jobflags & AIOCBLIST_DONE) 418 ki->kaio_queue_finished_count--; 419 num_queue_count--; 420 } else { 421 if (lj) { 422 lj->lioj_buffer_count--; 423 if (aiocbe->jobflags & AIOCBLIST_DONE) 424 lj->lioj_buffer_finished_count--; 425 } 426 if (aiocbe->jobflags & AIOCBLIST_DONE) 427 ki->kaio_buffer_finished_count--; 428 ki->kaio_buffer_count--; 429 num_buf_aio--; 430 } 431 432 /* aiocbe is going away, we need to destroy any knotes */ 433 knote_remove(&p->p_thread, &aiocbe->klist); /* XXXKSE */ 434 /* XXXKSE Note the thread here is used to eventually find the 435 * owning process again, but it is also used to do a fo_close 436 * and that requires the thread. (but does it require the 437 * OWNING thread? (or maby the running thread?) 438 * There is a semantic problem here... 
439 */ 440 441 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) 442 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { 443 ki->kaio_flags &= ~KAIO_WAKEUP; 444 wakeup(p); 445 } 446 447 if (aiocbe->jobstate == JOBST_JOBQBUF) { 448 if ((error = aio_fphysio(aiocbe)) != 0) 449 return error; 450 if (aiocbe->jobstate != JOBST_JOBBFINISHED) 451 panic("aio_free_entry: invalid physio finish-up state"); 452 s = splbio(); 453 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 454 splx(s); 455 } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) { 456 s = splnet(); 457 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 458 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); 459 splx(s); 460 } else if (aiocbe->jobstate == JOBST_JOBFINISHED) 461 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); 462 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { 463 s = splbio(); 464 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 465 splx(s); 466 if (aiocbe->bp) { 467 vunmapbuf(aiocbe->bp); 468 relpbuf(aiocbe->bp, NULL); 469 aiocbe->bp = NULL; 470 } 471 } 472 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { 473 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 474 zfree(aiolio_zone, lj); 475 } 476 aiocbe->jobstate = JOBST_NULL; 477 untimeout(process_signal, aiocbe, aiocbe->timeouthandle); 478 zfree(aiocb_zone, aiocbe); 479 return 0; 480 } 481 482 /* 483 * Rundown the jobs for a given process. 484 */ 485 static void 486 aio_proc_rundown(struct proc *p) 487 { 488 int s; 489 struct kaioinfo *ki; 490 struct aio_liojob *lj, *ljn; 491 struct aiocblist *aiocbe, *aiocbn; 492 struct file *fp; 493 struct filedesc *fdp; 494 struct socket *so; 495 496 ki = p->p_aioinfo; 497 if (ki == NULL) 498 return; 499 500 ki->kaio_flags |= LIOJ_SIGNAL_POSTED; 501 while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count > 502 ki->kaio_buffer_finished_count)) { 503 ki->kaio_flags |= KAIO_RUNDOWN; 504 if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) 505 break; 506 } 507 508 /* 509 * Move any aio ops that are waiting on socket I/O to the normal job 510 * queues so they are cleaned up with any others. 511 */ 512 fdp = p->p_fd; 513 514 s = splnet(); 515 for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe = 516 aiocbn) { 517 aiocbn = TAILQ_NEXT(aiocbe, plist); 518 fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes]; 519 520 /* 521 * Under some circumstances, the aio_fildes and the file 522 * structure don't match. This would leave aiocbe's in the 523 * TAILQ associated with the socket and cause a panic later. 524 * 525 * Detect and fix. 526 */ 527 if ((fp == NULL) || (fp != aiocbe->fd_file)) 528 fp = aiocbe->fd_file; 529 if (fp) { 530 so = (struct socket *)fp->f_data; 531 TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list); 532 if (TAILQ_EMPTY(&so->so_aiojobq)) { 533 so->so_snd.sb_flags &= ~SB_AIO; 534 so->so_rcv.sb_flags &= ~SB_AIO; 535 } 536 } 537 TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist); 538 TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list); 539 TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist); 540 } 541 splx(s); 542 543 restart1: 544 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) { 545 aiocbn = TAILQ_NEXT(aiocbe, plist); 546 if (aio_free_entry(aiocbe)) 547 goto restart1; 548 } 549 550 restart2: 551 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = 552 aiocbn) { 553 aiocbn = TAILQ_NEXT(aiocbe, plist); 554 if (aio_free_entry(aiocbe)) 555 goto restart2; 556 } 557 558 /* 559 * Note the use of lots of splbio here, trying to avoid splbio for long chains 560 * of I/O. 
Probably unnecessary. 561 */ 562 restart3: 563 s = splbio(); 564 while (TAILQ_FIRST(&ki->kaio_bufqueue)) { 565 ki->kaio_flags |= KAIO_WAKEUP; 566 tsleep(p, PRIBIO, "aioprn", 0); 567 splx(s); 568 goto restart3; 569 } 570 splx(s); 571 572 restart4: 573 s = splbio(); 574 for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) { 575 aiocbn = TAILQ_NEXT(aiocbe, plist); 576 if (aio_free_entry(aiocbe)) { 577 splx(s); 578 goto restart4; 579 } 580 } 581 splx(s); 582 583 /* 584 * If we've slept, jobs might have moved from one queue to another. 585 * Retry rundown if we didn't manage to empty the queues. 586 */ 587 if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL || 588 TAILQ_FIRST(&ki->kaio_jobqueue) != NULL || 589 TAILQ_FIRST(&ki->kaio_bufqueue) != NULL || 590 TAILQ_FIRST(&ki->kaio_bufdone) != NULL) 591 goto restart1; 592 593 for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) { 594 ljn = TAILQ_NEXT(lj, lioj_list); 595 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 596 0)) { 597 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 598 zfree(aiolio_zone, lj); 599 } else { 600 #ifdef DIAGNOSTIC 601 printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, " 602 "QF:%d\n", lj->lioj_buffer_count, 603 lj->lioj_buffer_finished_count, 604 lj->lioj_queue_count, 605 lj->lioj_queue_finished_count); 606 #endif 607 } 608 } 609 610 zfree(kaio_zone, ki); 611 p->p_aioinfo = NULL; 612 } 613 614 /* 615 * Select a job to run (called by an AIO daemon). 616 */ 617 static struct aiocblist * 618 aio_selectjob(struct aiothreadlist *aiop) 619 { 620 int s; 621 struct aiocblist *aiocbe; 622 struct kaioinfo *ki; 623 struct proc *userp; 624 625 s = splnet(); 626 for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = 627 TAILQ_NEXT(aiocbe, list)) { 628 userp = aiocbe->userproc; 629 ki = userp->p_aioinfo; 630 631 if (ki->kaio_active_count < ki->kaio_maxactive_count) { 632 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 633 splx(s); 634 return aiocbe; 635 } 636 } 637 splx(s); 638 639 return NULL; 640 } 641 642 /* 643 * The AIO processing activity. This is the code that does the I/O request for 644 * the non-physio version of the operations. The normal vn operations are used, 645 * and this code should work in all instances for every type of file, including 646 * pipes, sockets, fifos, and regular files. 
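 *
 * For illustration, a job serviced here is the kernel half of an ordinary
 * POSIX aio_read(2)/aio_write(2) request.  A user program would typically
 * have queued it along these lines (a sketch of standard POSIX usage, not
 * code from this file):
 *
 *	struct aiocb acb;
 *	char buf[512];
 *
 *	memset(&acb, 0, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = sizeof(buf);
 *	acb.aio_offset = 0;
 *	if (aio_read(&acb) == -1)
 *		err(1, "aio_read");
 *
 * An aiod picks such a job up via aio_selectjob() and performs the
 * transfer below with fo_read()/fo_write() at the requested offset.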
647 */ 648 static void 649 aio_process(struct aiocblist *aiocbe) 650 { 651 struct filedesc *fdp; 652 struct thread *td; 653 struct proc *userp; 654 struct proc *mycp; 655 struct aiocb *cb; 656 struct file *fp; 657 struct uio auio; 658 struct iovec aiov; 659 unsigned int fd; 660 int cnt; 661 int error; 662 off_t offset; 663 int oublock_st, oublock_end; 664 int inblock_st, inblock_end; 665 666 userp = aiocbe->userproc; 667 td = curthread; 668 mycp = td->td_proc; 669 cb = &aiocbe->uaiocb; 670 671 fdp = mycp->p_fd; 672 fd = cb->aio_fildes; 673 fp = fdp->fd_ofiles[fd]; 674 675 if ((fp == NULL) || (fp != aiocbe->fd_file)) { 676 cb->_aiocb_private.error = EBADF; 677 cb->_aiocb_private.status = -1; 678 return; 679 } 680 681 aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; 682 aiov.iov_len = cb->aio_nbytes; 683 684 auio.uio_iov = &aiov; 685 auio.uio_iovcnt = 1; 686 auio.uio_offset = offset = cb->aio_offset; 687 auio.uio_resid = cb->aio_nbytes; 688 cnt = cb->aio_nbytes; 689 auio.uio_segflg = UIO_USERSPACE; 690 auio.uio_td = td; 691 692 inblock_st = mycp->p_stats->p_ru.ru_inblock; 693 oublock_st = mycp->p_stats->p_ru.ru_oublock; 694 /* 695 * Temporarily bump the ref count while reading to avoid the 696 * descriptor being ripped out from under us. 697 */ 698 fhold(fp); 699 if (cb->aio_lio_opcode == LIO_READ) { 700 auio.uio_rw = UIO_READ; 701 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); 702 } else { 703 auio.uio_rw = UIO_WRITE; 704 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); 705 } 706 fdrop(fp, td); 707 inblock_end = mycp->p_stats->p_ru.ru_inblock; 708 oublock_end = mycp->p_stats->p_ru.ru_oublock; 709 710 aiocbe->inputcharge = inblock_end - inblock_st; 711 aiocbe->outputcharge = oublock_end - oublock_st; 712 713 if ((error) && (auio.uio_resid != cnt)) { 714 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) 715 error = 0; 716 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { 717 PROC_LOCK(userp); 718 psignal(userp, SIGPIPE); 719 PROC_UNLOCK(userp); 720 } 721 } 722 723 cnt -= auio.uio_resid; 724 cb->_aiocb_private.error = error; 725 cb->_aiocb_private.status = cnt; 726 } 727 728 /* 729 * The AIO daemon, most of the actual work is done in aio_process, 730 * but the setup (and address space mgmt) is done in this routine. 731 */ 732 static void 733 aio_daemon(void *uproc) 734 { 735 int s; 736 struct aio_liojob *lj; 737 struct aiocb *cb; 738 struct aiocblist *aiocbe; 739 struct aiothreadlist *aiop; 740 struct kaioinfo *ki; 741 struct proc *curcp, *mycp, *userp; 742 struct vmspace *myvm, *tmpvm; 743 struct thread *td = curthread; 744 745 mtx_lock(&Giant); 746 /* 747 * Local copies of curproc (cp) and vmspace (myvm) 748 */ 749 mycp = td->td_proc; 750 myvm = mycp->p_vmspace; 751 752 if (mycp->p_textvp) { 753 vrele(mycp->p_textvp); 754 mycp->p_textvp = NULL; 755 } 756 757 /* 758 * Allocate and ready the aio control info. There is one aiop structure 759 * per daemon. 760 */ 761 aiop = zalloc(aiop_zone); 762 aiop->aiothread = td; 763 aiop->aiothreadflags |= AIOP_FREE; 764 765 s = splnet(); 766 767 /* 768 * Place thread (lightweight process) onto the AIO free thread list. 769 */ 770 if (TAILQ_EMPTY(&aio_freeproc)) 771 wakeup(&aio_freeproc); 772 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 773 774 splx(s); 775 776 /* 777 * Get rid of our current filedescriptors. AIOD's don't need any 778 * filedescriptors, except as temporarily inherited from the client. 779 */ 780 fdfree(td); 781 mycp->p_fd = NULL; 782 783 /* The daemon resides in its own pgrp. 
 */
	enterpgrp(mycp, mycp->p_pid, 1);

	/* Mark special process type. */
	mycp->p_flag |= P_SYSTEM;

	/*
	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
	 * and creating too many daemons.)
	 */
	wakeup(mycp);

	for (;;) {
		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aiothreadflags & AIOP_FREE) {
			s = splnet();
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aiothreadflags &= ~AIOP_FREE;
			splx(s);
		}
		aiop->aiothreadflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs.
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program.
			 */
			if (userp != curcp) {
				/*
				 * Save the current address space that we are
				 * connected to.
				 */
				tmpvm = mycp->p_vmspace;

				/*
				 * Point to the new user address space, and
				 * refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				mycp->p_vmspace->vm_refcnt++;

				/* Activate the new mapping. */
				pmap_activate(&mycp->p_thread);

				/*
				 * If the old address space wasn't the daemon's
				 * own address space, then we need to remove
				 * the daemon's reference from the other
				 * process that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}

				/*
				 * Disassociate from the previous client's file
				 * descriptors, and associate to the new
				 * client's descriptors.  Note that the daemon
				 * doesn't need to worry about its original
				 * descriptors, because they were originally
				 * freed.
				 */
				if (mycp->p_fd)
					fdfree(td);
				mycp->p_fd = fdshare(userp);
				curcp = userp;
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/* Account for currently active jobs. */
			ki->kaio_active_count++;

			/* Do the I/O function. */
			aiocbe->jobaiothread = aiop;
			aio_process(aiocbe);

			/* Decrement the active job count. */
			ki->kaio_active_count--;

			/*
			 * Increment the completion count for wakeup/signal
			 * comparisons.
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj)
				lj->lioj_queue_finished_count++;
			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			s = splbio();
			if (lj && (lj->lioj_flags &
			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count ==
				    lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count ==
				    lj->lioj_buffer_count)) {
					PROC_LOCK(userp);
					psignal(userp,
					    lj->lioj_signal.sigev_signo);
					PROC_UNLOCK(userp);
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
				}
			}
			splx(s);

			aiocbe->jobstate = JOBST_JOBFINISHED;

			/*
			 * If the I/O request should be automatically rundown,
			 * do the needed cleanup.  Otherwise, place the queue
			 * entry for the just finished I/O request into the done
			 * queue for the associated client.
916 */ 917 s = splnet(); 918 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) { 919 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; 920 zfree(aiocb_zone, aiocbe); 921 } else { 922 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); 923 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, 924 plist); 925 } 926 splx(s); 927 KNOTE(&aiocbe->klist, 0); 928 929 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { 930 wakeup(aiocbe); 931 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; 932 } 933 934 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 935 PROC_LOCK(userp); 936 psignal(userp, cb->aio_sigevent.sigev_signo); 937 PROC_UNLOCK(userp); 938 } 939 } 940 941 /* 942 * Disconnect from user address space. 943 */ 944 if (curcp != mycp) { 945 /* Get the user address space to disconnect from. */ 946 tmpvm = mycp->p_vmspace; 947 948 /* Get original address space for daemon. */ 949 mycp->p_vmspace = myvm; 950 951 /* Activate the daemon's address space. */ 952 pmap_activate(&mycp->p_thread); 953 #ifdef DIAGNOSTIC 954 if (tmpvm == myvm) { 955 printf("AIOD: vmspace problem -- %d\n", 956 mycp->p_pid); 957 } 958 #endif 959 /* Remove our vmspace reference. */ 960 vmspace_free(tmpvm); 961 962 /* 963 * Disassociate from the user process's file 964 * descriptors. 965 */ 966 if (mycp->p_fd) 967 fdfree(td); 968 mycp->p_fd = NULL; 969 curcp = mycp; 970 } 971 972 /* 973 * If we are the first to be put onto the free queue, wakeup 974 * anyone waiting for a daemon. 975 */ 976 s = splnet(); 977 TAILQ_REMOVE(&aio_activeproc, aiop, list); 978 if (TAILQ_EMPTY(&aio_freeproc)) 979 wakeup(&aio_freeproc); 980 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 981 aiop->aiothreadflags |= AIOP_FREE; 982 splx(s); 983 984 /* 985 * If daemon is inactive for a long time, allow it to exit, 986 * thereby freeing resources. 987 */ 988 if ((aiop->aiothreadflags & AIOP_SCHED) == 0 && 989 tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) { 990 s = splnet(); 991 if (TAILQ_EMPTY(&aio_jobs)) { 992 if ((aiop->aiothreadflags & AIOP_FREE) && 993 (num_aio_procs > target_aio_procs)) { 994 TAILQ_REMOVE(&aio_freeproc, aiop, list); 995 splx(s); 996 zfree(aiop_zone, aiop); 997 num_aio_procs--; 998 #ifdef DIAGNOSTIC 999 if (mycp->p_vmspace->vm_refcnt <= 1) { 1000 printf("AIOD: bad vm refcnt for" 1001 " exiting daemon: %d\n", 1002 mycp->p_vmspace->vm_refcnt); 1003 } 1004 #endif 1005 kthread_exit(0); 1006 } 1007 } 1008 splx(s); 1009 } 1010 } 1011 } 1012 1013 /* 1014 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The 1015 * AIO daemon modifies its environment itself. 1016 */ 1017 static int 1018 aio_newproc() 1019 { 1020 int error; 1021 struct proc *p; 1022 1023 error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d", 1024 num_aio_procs); 1025 if (error) 1026 return error; 1027 1028 /* 1029 * Wait until daemon is started, but continue on just in case to 1030 * handle error conditions. 1031 */ 1032 error = tsleep(p, PZERO, "aiosta", aiod_timeout); 1033 1034 num_aio_procs++; 1035 1036 return error; 1037 } 1038 1039 /* 1040 * Try the high-performance, low-overhead physio method for eligible 1041 * VCHR devices. This method doesn't use an aio helper thread, and 1042 * thus has very low overhead. 1043 * 1044 * Assumes that the caller, _aio_aqueue(), has incremented the file 1045 * structure's reference count, preventing its deallocation for the 1046 * duration of this call. 
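 *
 * A request is handled here only if all of the following hold (checked
 * below); otherwise the caller falls back to the daemon-based path:
 *	- the descriptor is a vnode for which vn_isdisk() is true;
 *	- aio_nbytes is a multiple of the device's si_bsize_phys;
 *	- the transfer, allowing for the user buffer's page offset, fits
 *	  within MAXPHYS;
 *	- the process is still under its kaio_ballowed_count buffer quota.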
 */
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int s;
	int notify;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = (struct vnode *)fp->f_data;

	/*
	 * If it's not a disk, we don't want to return a positive error.
	 * A positive error keeps the aio code from falling through to the
	 * thread-based path when the descriptor refers to a regular file.
	 */
	if (!vn_isdisk(vp, &error)) {
		if (error == ENOTBLK)
			return (-1);
		else
			return (error);
	}

	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
		return (-1);

	if (cb->aio_nbytes >
	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
		return (-1);

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj)
		lj->lioj_buffer_count++;

	/* Create and build a buffer header for a transfer. */
	bp = (struct buf *)getpbuf(NULL);
	BUF_KERNPROC(bp);

	/*
	 * Get a copy of the kva from the physical buffer.
	 */
	bp->b_caller1 = p;
	bp->b_dev = vp->v_rdev;
	error = bp->b_error = 0;

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_PHYS;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	if (cb->aio_lio_opcode == LIO_WRITE) {
		bp->b_iocmd = BIO_WRITE;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
			error = EFAULT;
			goto doerror;
		}
	} else {
		bp->b_iocmd = BIO_READ;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
			error = EFAULT;
			goto doerror;
		}
	}

	/* Bring buffer into kernel space. */
	vmapbuf(bp);

	s = splbio();
	aiocbe->bp = bp;
	bp->b_spc = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	splx(s);

	/* Perform transfer. */
	DEV_STRATEGY(bp, 0);

	notify = 0;
	s = splbio();

	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error in
	 * transfer.  Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism.  In this case,
	 * aio_suspend will return immediately.
1162 */ 1163 if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) { 1164 struct aiocb *job = aiocbe->uuaiocb; 1165 1166 aiocbe->uaiocb._aiocb_private.status = 0; 1167 suword(&job->_aiocb_private.status, 0); 1168 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 1169 suword(&job->_aiocb_private.error, bp->b_error); 1170 1171 ki->kaio_buffer_finished_count++; 1172 1173 if (aiocbe->jobstate != JOBST_JOBBFINISHED) { 1174 aiocbe->jobstate = JOBST_JOBBFINISHED; 1175 aiocbe->jobflags |= AIOCBLIST_DONE; 1176 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 1177 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 1178 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 1179 notify = 1; 1180 } 1181 } 1182 splx(s); 1183 if (notify) 1184 KNOTE(&aiocbe->klist, 0); 1185 return 0; 1186 1187 doerror: 1188 ki->kaio_buffer_count--; 1189 if (lj) 1190 lj->lioj_buffer_count--; 1191 aiocbe->bp = NULL; 1192 relpbuf(bp, NULL); 1193 return error; 1194 } 1195 1196 /* 1197 * This waits/tests physio completion. 1198 */ 1199 static int 1200 aio_fphysio(struct aiocblist *iocb) 1201 { 1202 int s; 1203 struct buf *bp; 1204 int error; 1205 1206 bp = iocb->bp; 1207 1208 s = splbio(); 1209 while ((bp->b_flags & B_DONE) == 0) { 1210 if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) { 1211 if ((bp->b_flags & B_DONE) == 0) { 1212 splx(s); 1213 return EINPROGRESS; 1214 } else 1215 break; 1216 } 1217 } 1218 splx(s); 1219 1220 /* Release mapping into kernel space. */ 1221 vunmapbuf(bp); 1222 iocb->bp = 0; 1223 1224 error = 0; 1225 1226 /* Check for an error. */ 1227 if (bp->b_ioflags & BIO_ERROR) 1228 error = bp->b_error; 1229 1230 relpbuf(bp, NULL); 1231 return (error); 1232 } 1233 1234 /* 1235 * Wake up aio requests that may be serviceable now. 1236 */ 1237 static void 1238 aio_swake_cb(struct socket *so, struct sockbuf *sb) 1239 { 1240 struct aiocblist *cb,*cbn; 1241 struct proc *p; 1242 struct kaioinfo *ki = NULL; 1243 int opcode, wakecount = 0; 1244 struct aiothreadlist *aiop; 1245 1246 if (sb == &so->so_snd) { 1247 opcode = LIO_WRITE; 1248 so->so_snd.sb_flags &= ~SB_AIO; 1249 } else { 1250 opcode = LIO_READ; 1251 so->so_rcv.sb_flags &= ~SB_AIO; 1252 } 1253 1254 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { 1255 cbn = TAILQ_NEXT(cb, list); 1256 if (opcode == cb->uaiocb.aio_lio_opcode) { 1257 p = cb->userproc; 1258 ki = p->p_aioinfo; 1259 TAILQ_REMOVE(&so->so_aiojobq, cb, list); 1260 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist); 1261 TAILQ_INSERT_TAIL(&aio_jobs, cb, list); 1262 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist); 1263 wakecount++; 1264 if (cb->jobstate != JOBST_JOBQGLOBAL) 1265 panic("invalid queue value"); 1266 } 1267 } 1268 1269 while (wakecount--) { 1270 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) { 1271 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1272 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1273 aiop->aiothreadflags &= ~AIOP_FREE; 1274 wakeup(aiop->aiothread); 1275 } 1276 } 1277 } 1278 1279 /* 1280 * Queue a new AIO request. Choosing either the threaded or direct physio VCHR 1281 * technique is done in this code. 
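 *
 * In outline: the user's aiocb is copied in and validated, kevent-based
 * completion notification is registered if requested, and the job is then
 * routed one of three ways:
 *	- a socket that is not yet ready for the requested operation:
 *	  queued on the socket's so_aiojobq until aio_swake_cb() moves it
 *	  back onto the regular queues;
 *	- a raw disk device: handed to aio_qphysio() for direct physio;
 *	- everything else: placed on the global aio_jobs queue for an aiod,
 *	  starting a new daemon if none is free and quotas permit.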
1282 */ 1283 static int 1284 _aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type) 1285 { 1286 struct proc *p = td->td_proc; 1287 struct filedesc *fdp; 1288 struct file *fp; 1289 unsigned int fd; 1290 struct socket *so; 1291 int s; 1292 int error; 1293 int opcode; 1294 struct aiocblist *aiocbe; 1295 struct aiothreadlist *aiop; 1296 struct kaioinfo *ki; 1297 struct kevent kev; 1298 struct kqueue *kq; 1299 struct file *kq_fp; 1300 1301 aiocbe = zalloc(aiocb_zone); 1302 aiocbe->inputcharge = 0; 1303 aiocbe->outputcharge = 0; 1304 callout_handle_init(&aiocbe->timeouthandle); 1305 SLIST_INIT(&aiocbe->klist); 1306 1307 suword(&job->_aiocb_private.status, -1); 1308 suword(&job->_aiocb_private.error, 0); 1309 suword(&job->_aiocb_private.kernelinfo, -1); 1310 1311 error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb)); 1312 if (error) { 1313 suword(&job->_aiocb_private.error, error); 1314 zfree(aiocb_zone, aiocbe); 1315 return error; 1316 } 1317 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL && 1318 !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { 1319 zfree(aiocb_zone, aiocbe); 1320 return EINVAL; 1321 } 1322 1323 /* Save userspace address of the job info. */ 1324 aiocbe->uuaiocb = job; 1325 1326 /* Get the opcode. */ 1327 if (type != LIO_NOP) 1328 aiocbe->uaiocb.aio_lio_opcode = type; 1329 opcode = aiocbe->uaiocb.aio_lio_opcode; 1330 1331 /* Get the fd info for process. */ 1332 fdp = p->p_fd; 1333 1334 /* 1335 * Range check file descriptor. 1336 */ 1337 fd = aiocbe->uaiocb.aio_fildes; 1338 if (fd >= fdp->fd_nfiles) { 1339 zfree(aiocb_zone, aiocbe); 1340 if (type == 0) 1341 suword(&job->_aiocb_private.error, EBADF); 1342 return EBADF; 1343 } 1344 1345 fp = aiocbe->fd_file = fdp->fd_ofiles[fd]; 1346 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 1347 0))) { 1348 zfree(aiocb_zone, aiocbe); 1349 if (type == 0) 1350 suword(&job->_aiocb_private.error, EBADF); 1351 return EBADF; 1352 } 1353 1354 if (aiocbe->uaiocb.aio_offset == -1LL) { 1355 zfree(aiocb_zone, aiocbe); 1356 if (type == 0) 1357 suword(&job->_aiocb_private.error, EINVAL); 1358 return EINVAL; 1359 } 1360 1361 error = suword(&job->_aiocb_private.kernelinfo, jobrefid); 1362 if (error) { 1363 zfree(aiocb_zone, aiocbe); 1364 if (type == 0) 1365 suword(&job->_aiocb_private.error, EINVAL); 1366 return error; 1367 } 1368 1369 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; 1370 if (jobrefid == LONG_MAX) 1371 jobrefid = 1; 1372 else 1373 jobrefid++; 1374 1375 if (opcode == LIO_NOP) { 1376 zfree(aiocb_zone, aiocbe); 1377 if (type == 0) { 1378 suword(&job->_aiocb_private.error, 0); 1379 suword(&job->_aiocb_private.status, 0); 1380 suword(&job->_aiocb_private.kernelinfo, 0); 1381 } 1382 return 0; 1383 } 1384 1385 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { 1386 zfree(aiocb_zone, aiocbe); 1387 if (type == 0) { 1388 suword(&job->_aiocb_private.status, 0); 1389 suword(&job->_aiocb_private.error, EINVAL); 1390 } 1391 return EINVAL; 1392 } 1393 1394 fhold(fp); 1395 1396 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) { 1397 kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; 1398 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr; 1399 } 1400 else { 1401 /* 1402 * This method for requesting kevent-based notification won't 1403 * work on the alpha, since we're passing in a pointer 1404 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT- 1405 * based method instead. 
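		 *
		 * The supported way to ask for kevent notification is through
		 * the sigevent embedded in the aiocb itself; for example, with
		 * kq a kqueue() descriptor and udata an opaque user pointer
		 * (illustrative userland sketch):
		 *
		 *	acb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
		 *	acb.aio_sigevent.sigev_notify_kqueue = kq;
		 *	acb.aio_sigevent.sigev_value.sigval_ptr = udata;
		 *
		 * Those are the fields read in the SIGEV_KEVENT branch above.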
		 */
		struct kevent *kevp;

		kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode;
		if (kevp == NULL)
			goto no_kqueue;

		error = copyin(kevp, &kev, sizeof(kev));
		if (error)
			goto aqueue_fail;
	}
	if ((u_int)kev.ident >= fdp->fd_nfiles ||
	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
	    (kq_fp->f_type != DTYPE_KQUEUE)) {
		error = EBADF;
		goto aqueue_fail;
	}
	kq = (struct kqueue *)kq_fp->f_data;
	kev.ident = (uintptr_t)aiocbe;
	kev.filter = EVFILT_AIO;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
	error = kqueue_register(kq, &kev, td);
aqueue_fail:
	if (error) {
		zfree(aiocb_zone, aiocbe);
		if (type == 0)
			suword(&job->_aiocb_private.error, error);
		goto done;
	}
no_kqueue:

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if (fp->f_type == DTYPE_SOCKET) {
		/*
		 * Alternate queueing for socket ops: Reach down into the
		 * descriptor to get the socket data.  Then check to see if the
		 * socket is ready to be read or written (based on the
		 * requested operation).
		 *
		 * If it is not ready for I/O, then queue the aiocbe on the
		 * socket, and set the flags so we get a call when sbnotify()
		 * happens.
		 */
		so = (struct socket *)fp->f_data;
		s = splnet();
		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
		    LIO_WRITE) && (!sowriteable(so)))) {
			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
			if (opcode == LIO_READ)
				so->so_rcv.sb_flags |= SB_AIO;
			else
				so->so_snd.sb_flags |= SB_AIO;
			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
			ki->kaio_queue_count++;
			num_queue_count++;
			splx(s);
			error = 0;
			goto done;
		}
		splx(s);
	}

	if ((error = aio_qphysio(p, aiocbe)) == 0)
		goto done;
	if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		goto done;
	}

	/* No buffer for daemon I/O. */
	aiocbe->bp = NULL;

	ki->kaio_queue_count++;
	if (lj)
		lj->lioj_queue_count++;
	s = splnet();
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	splx(s);
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our quota,
	 * then start one.  Otherwise, depend on the subsequent I/O completions
	 * to pick up this job.  If we don't successfully create the new
	 * process (thread) due to resource issues, we return an error for now
	 * (EAGAIN), which is likely not the correct thing to do.
1505 */ 1506 s = splnet(); 1507 retryproc: 1508 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { 1509 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1510 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1511 aiop->aiothreadflags &= ~AIOP_FREE; 1512 wakeup(aiop->aiothread); 1513 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && 1514 ((ki->kaio_active_count + num_aio_resv_start) < 1515 ki->kaio_maxactive_count)) { 1516 num_aio_resv_start++; 1517 if ((error = aio_newproc()) == 0) { 1518 num_aio_resv_start--; 1519 td->td_retval[0] = 0; 1520 goto retryproc; 1521 } 1522 num_aio_resv_start--; 1523 } 1524 splx(s); 1525 done: 1526 fdrop(fp, td); 1527 return error; 1528 } 1529 1530 /* 1531 * This routine queues an AIO request, checking for quotas. 1532 */ 1533 static int 1534 aio_aqueue(struct thread *td, struct aiocb *job, int type) 1535 { 1536 struct proc *p = td->td_proc; 1537 struct kaioinfo *ki; 1538 1539 if (p->p_aioinfo == NULL) 1540 aio_init_aioinfo(p); 1541 1542 if (num_queue_count >= max_queue_count) 1543 return EAGAIN; 1544 1545 ki = p->p_aioinfo; 1546 if (ki->kaio_queue_count >= ki->kaio_qallowed_count) 1547 return EAGAIN; 1548 1549 return _aio_aqueue(td, job, NULL, type); 1550 } 1551 1552 /* 1553 * Support the aio_return system call, as a side-effect, kernel resources are 1554 * released. 1555 */ 1556 int 1557 aio_return(struct thread *td, struct aio_return_args *uap) 1558 { 1559 struct proc *p = td->td_proc; 1560 int s; 1561 int jobref; 1562 struct aiocblist *cb, *ncb; 1563 struct aiocb *ujob; 1564 struct kaioinfo *ki; 1565 1566 ki = p->p_aioinfo; 1567 if (ki == NULL) 1568 return EINVAL; 1569 1570 ujob = uap->aiocbp; 1571 1572 jobref = fuword(&ujob->_aiocb_private.kernelinfo); 1573 if (jobref == -1 || jobref == 0) 1574 return EINVAL; 1575 1576 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { 1577 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == 1578 jobref) { 1579 if (ujob == cb->uuaiocb) { 1580 td->td_retval[0] = 1581 cb->uaiocb._aiocb_private.status; 1582 } else 1583 td->td_retval[0] = EFAULT; 1584 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 1585 p->p_stats->p_ru.ru_oublock += 1586 cb->outputcharge; 1587 cb->outputcharge = 0; 1588 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 1589 p->p_stats->p_ru.ru_inblock += cb->inputcharge; 1590 cb->inputcharge = 0; 1591 } 1592 aio_free_entry(cb); 1593 return 0; 1594 } 1595 } 1596 s = splbio(); 1597 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { 1598 ncb = TAILQ_NEXT(cb, plist); 1599 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) 1600 == jobref) { 1601 splx(s); 1602 if (ujob == cb->uuaiocb) { 1603 td->td_retval[0] = 1604 cb->uaiocb._aiocb_private.status; 1605 } else 1606 td->td_retval[0] = EFAULT; 1607 aio_free_entry(cb); 1608 return 0; 1609 } 1610 } 1611 splx(s); 1612 1613 return (EINVAL); 1614 } 1615 1616 /* 1617 * Allow a process to wakeup when any of the I/O requests are completed. 1618 */ 1619 int 1620 aio_suspend(struct thread *td, struct aio_suspend_args *uap) 1621 { 1622 struct proc *p = td->td_proc; 1623 struct timeval atv; 1624 struct timespec ts; 1625 struct aiocb *const *cbptr, *cbp; 1626 struct kaioinfo *ki; 1627 struct aiocblist *cb; 1628 int i; 1629 int njoblist; 1630 int error, s, timo; 1631 int *ijoblist; 1632 struct aiocb **ujoblist; 1633 1634 if (uap->nent > AIO_LISTIO_MAX) 1635 return EINVAL; 1636 1637 timo = 0; 1638 if (uap->timeout) { 1639 /* Get timespec struct. 
*/ 1640 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) 1641 return error; 1642 1643 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) 1644 return (EINVAL); 1645 1646 TIMESPEC_TO_TIMEVAL(&atv, &ts); 1647 if (itimerfix(&atv)) 1648 return (EINVAL); 1649 timo = tvtohz(&atv); 1650 } 1651 1652 ki = p->p_aioinfo; 1653 if (ki == NULL) 1654 return EAGAIN; 1655 1656 njoblist = 0; 1657 ijoblist = zalloc(aiol_zone); 1658 ujoblist = zalloc(aiol_zone); 1659 cbptr = uap->aiocbp; 1660 1661 for (i = 0; i < uap->nent; i++) { 1662 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 1663 if (cbp == 0) 1664 continue; 1665 ujoblist[njoblist] = cbp; 1666 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); 1667 njoblist++; 1668 } 1669 1670 if (njoblist == 0) { 1671 zfree(aiol_zone, ijoblist); 1672 zfree(aiol_zone, ujoblist); 1673 return 0; 1674 } 1675 1676 error = 0; 1677 for (;;) { 1678 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { 1679 for (i = 0; i < njoblist; i++) { 1680 if (((intptr_t) 1681 cb->uaiocb._aiocb_private.kernelinfo) == 1682 ijoblist[i]) { 1683 if (ujoblist[i] != cb->uuaiocb) 1684 error = EINVAL; 1685 zfree(aiol_zone, ijoblist); 1686 zfree(aiol_zone, ujoblist); 1687 return error; 1688 } 1689 } 1690 } 1691 1692 s = splbio(); 1693 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = 1694 TAILQ_NEXT(cb, plist)) { 1695 for (i = 0; i < njoblist; i++) { 1696 if (((intptr_t) 1697 cb->uaiocb._aiocb_private.kernelinfo) == 1698 ijoblist[i]) { 1699 splx(s); 1700 if (ujoblist[i] != cb->uuaiocb) 1701 error = EINVAL; 1702 zfree(aiol_zone, ijoblist); 1703 zfree(aiol_zone, ujoblist); 1704 return error; 1705 } 1706 } 1707 } 1708 1709 ki->kaio_flags |= KAIO_WAKEUP; 1710 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo); 1711 splx(s); 1712 1713 if (error == ERESTART || error == EINTR) { 1714 zfree(aiol_zone, ijoblist); 1715 zfree(aiol_zone, ujoblist); 1716 return EINTR; 1717 } else if (error == EWOULDBLOCK) { 1718 zfree(aiol_zone, ijoblist); 1719 zfree(aiol_zone, ujoblist); 1720 return EAGAIN; 1721 } 1722 } 1723 1724 /* NOTREACHED */ 1725 return EINVAL; 1726 } 1727 1728 /* 1729 * aio_cancel cancels any non-physio aio operations not currently in 1730 * progress. 
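 *
 * Requests that an aiod is already servicing, and physio requests against
 * raw disk devices, are not cancelled; for those the call reports
 * AIO_NOTCANCELED and the caller must collect the result later with
 * aio_error()/aio_return().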
 */
int
aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
	struct proc *p = td->td_proc;
	struct kaioinfo *ki;
	struct aiocblist *cbe, *cbn;
	struct file *fp;
	struct filedesc *fdp;
	struct socket *so;
	struct proc *po;
	int s, error;
	int cancelled = 0;
	int notcancelled = 0;
	struct vnode *vp;

	fdp = p->p_fd;
	if ((u_int)uap->fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_VNODE) {
		vp = (struct vnode *)fp->f_data;

		if (vn_isdisk(vp, &error)) {
			td->td_retval[0] = AIO_NOTCANCELED;
			return 0;
		}
	} else if (fp->f_type == DTYPE_SOCKET) {
		so = (struct socket *)fp->f_data;

		s = splnet();

		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
			cbn = TAILQ_NEXT(cbe, list);
			if ((uap->aiocbp == NULL) ||
			    (uap->aiocbp == cbe->uuaiocb)) {
				po = cbe->userproc;
				ki = po->p_aioinfo;
				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
				    plist);
				if (ki->kaio_flags & KAIO_WAKEUP) {
					wakeup(po);
				}
				cbe->jobstate = JOBST_JOBFINISHED;
				cbe->uaiocb._aiocb_private.status = -1;
				cbe->uaiocb._aiocb_private.error = ECANCELED;
				cancelled++;
				/* XXX cancelled, knote? */
				if (cbe->uaiocb.aio_sigevent.sigev_notify ==
				    SIGEV_SIGNAL) {
					PROC_LOCK(cbe->userproc);
					psignal(cbe->userproc,
					    cbe->uaiocb.aio_sigevent.sigev_signo);
					PROC_UNLOCK(cbe->userproc);
				}
				if (uap->aiocbp)
					break;
			}
		}
		splx(s);

		if ((cancelled) && (uap->aiocbp)) {
			td->td_retval[0] = AIO_CANCELED;
			return 0;
		}
	}
	ki = p->p_aioinfo;
	s = splnet();

	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
		cbn = TAILQ_NEXT(cbe, plist);

		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
		    ((uap->aiocbp == NULL) ||
		    (uap->aiocbp == cbe->uuaiocb))) {

			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
				TAILQ_REMOVE(&aio_jobs, cbe, list);
				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
				    plist);
				cancelled++;
				ki->kaio_queue_finished_count++;
				cbe->jobstate = JOBST_JOBFINISHED;
				cbe->uaiocb._aiocb_private.status = -1;
				cbe->uaiocb._aiocb_private.error = ECANCELED;
				/* XXX cancelled, knote? */
				if (cbe->uaiocb.aio_sigevent.sigev_notify ==
				    SIGEV_SIGNAL) {
					PROC_LOCK(cbe->userproc);
					psignal(cbe->userproc,
					    cbe->uaiocb.aio_sigevent.sigev_signo);
					PROC_UNLOCK(cbe->userproc);
				}
			} else {
				notcancelled++;
			}
		}
	}
	splx(s);

	if (notcancelled) {
		td->td_retval[0] = AIO_NOTCANCELED;
		return 0;
	}
	if (cancelled) {
		td->td_retval[0] = AIO_CANCELED;
		return 0;
	}
	td->td_retval[0] = AIO_ALLDONE;

	return 0;
}

/*
 * aio_error is implemented at the kernel level for compatibility purposes
 * only.  For a user mode async implementation, it would be best to do it in
 * a userland subroutine.
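 *
 * Typical userland usage polls until the request leaves the EINPROGRESS
 * state and then reaps the result with aio_return() (illustrative sketch):
 *
 *	while ((error = aio_error(&acb)) == EINPROGRESS)
 *		usleep(1000);
 *	nbytes = aio_return(&acb);
 *
 * A zero error means nbytes holds the transfer count, mirroring the
 * return value of the equivalent synchronous read() or write().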
1849 */ 1850 int 1851 aio_error(struct thread *td, struct aio_error_args *uap) 1852 { 1853 struct proc *p = td->td_proc; 1854 int s; 1855 struct aiocblist *cb; 1856 struct kaioinfo *ki; 1857 int jobref; 1858 1859 ki = p->p_aioinfo; 1860 if (ki == NULL) 1861 return EINVAL; 1862 1863 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); 1864 if ((jobref == -1) || (jobref == 0)) 1865 return EINVAL; 1866 1867 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { 1868 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1869 jobref) { 1870 td->td_retval[0] = cb->uaiocb._aiocb_private.error; 1871 return 0; 1872 } 1873 } 1874 1875 s = splnet(); 1876 1877 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, 1878 plist)) { 1879 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1880 jobref) { 1881 td->td_retval[0] = EINPROGRESS; 1882 splx(s); 1883 return 0; 1884 } 1885 } 1886 1887 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb, 1888 plist)) { 1889 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1890 jobref) { 1891 td->td_retval[0] = EINPROGRESS; 1892 splx(s); 1893 return 0; 1894 } 1895 } 1896 splx(s); 1897 1898 s = splbio(); 1899 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, 1900 plist)) { 1901 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1902 jobref) { 1903 td->td_retval[0] = cb->uaiocb._aiocb_private.error; 1904 splx(s); 1905 return 0; 1906 } 1907 } 1908 1909 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, 1910 plist)) { 1911 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1912 jobref) { 1913 td->td_retval[0] = EINPROGRESS; 1914 splx(s); 1915 return 0; 1916 } 1917 } 1918 splx(s); 1919 1920 #if (0) 1921 /* 1922 * Hack for lio. 1923 */ 1924 status = fuword(&uap->aiocbp->_aiocb_private.status); 1925 if (status == -1) 1926 return fuword(&uap->aiocbp->_aiocb_private.error); 1927 #endif 1928 return EINVAL; 1929 } 1930 1931 int 1932 aio_read(struct thread *td, struct aio_read_args *uap) 1933 { 1934 1935 return aio_aqueue(td, uap->aiocbp, LIO_READ); 1936 } 1937 1938 int 1939 aio_write(struct thread *td, struct aio_write_args *uap) 1940 { 1941 1942 return aio_aqueue(td, uap->aiocbp, LIO_WRITE); 1943 } 1944 1945 int 1946 lio_listio(struct thread *td, struct lio_listio_args *uap) 1947 { 1948 struct proc *p = td->td_proc; 1949 int nent, nentqueued; 1950 struct aiocb *iocb, * const *cbptr; 1951 struct aiocblist *cb; 1952 struct kaioinfo *ki; 1953 struct aio_liojob *lj; 1954 int error, runningcode; 1955 int nerror; 1956 int i; 1957 int s; 1958 1959 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 1960 return EINVAL; 1961 1962 nent = uap->nent; 1963 if (nent > AIO_LISTIO_MAX) 1964 return EINVAL; 1965 1966 if (p->p_aioinfo == NULL) 1967 aio_init_aioinfo(p); 1968 1969 if ((nent + num_queue_count) > max_queue_count) 1970 return EAGAIN; 1971 1972 ki = p->p_aioinfo; 1973 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) 1974 return EAGAIN; 1975 1976 lj = zalloc(aiolio_zone); 1977 if (!lj) 1978 return EAGAIN; 1979 1980 lj->lioj_flags = 0; 1981 lj->lioj_buffer_count = 0; 1982 lj->lioj_buffer_finished_count = 0; 1983 lj->lioj_queue_count = 0; 1984 lj->lioj_queue_finished_count = 0; 1985 lj->lioj_ki = ki; 1986 1987 /* 1988 * Setup signal. 
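	 * For LIO_NOWAIT the caller may supply a sigevent to be delivered
	 * once every job in the list has completed, e.g. (illustrative
	 * userland sketch):
	 *
	 *	struct sigevent sev;
	 *
	 *	sev.sigev_notify = SIGEV_SIGNAL;
	 *	sev.sigev_signo = SIGUSR1;
	 *	lio_listio(LIO_NOWAIT, list, nent, &sev);
	 *
	 * The signal number is validated here and posted exactly once,
	 * guarded by LIOJ_SIGNAL_POSTED, when the last queued job finishes.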
1989 */ 1990 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 1991 error = copyin(uap->sig, &lj->lioj_signal, 1992 sizeof(lj->lioj_signal)); 1993 if (error) { 1994 zfree(aiolio_zone, lj); 1995 return error; 1996 } 1997 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { 1998 zfree(aiolio_zone, lj); 1999 return EINVAL; 2000 } 2001 lj->lioj_flags |= LIOJ_SIGNAL; 2002 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; 2003 } else 2004 lj->lioj_flags &= ~LIOJ_SIGNAL; 2005 2006 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); 2007 /* 2008 * Get pointers to the list of I/O requests. 2009 */ 2010 nerror = 0; 2011 nentqueued = 0; 2012 cbptr = uap->acb_list; 2013 for (i = 0; i < uap->nent; i++) { 2014 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2015 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) { 2016 error = _aio_aqueue(td, iocb, lj, 0); 2017 if (error == 0) 2018 nentqueued++; 2019 else 2020 nerror++; 2021 } 2022 } 2023 2024 /* 2025 * If we haven't queued any, then just return error. 2026 */ 2027 if (nentqueued == 0) 2028 return 0; 2029 2030 /* 2031 * Calculate the appropriate error return. 2032 */ 2033 runningcode = 0; 2034 if (nerror) 2035 runningcode = EIO; 2036 2037 if (uap->mode == LIO_WAIT) { 2038 int command, found, jobref; 2039 2040 for (;;) { 2041 found = 0; 2042 for (i = 0; i < uap->nent; i++) { 2043 /* 2044 * Fetch address of the control buf pointer in 2045 * user space. 2046 */ 2047 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2048 if (((intptr_t)iocb == -1) || ((intptr_t)iocb 2049 == 0)) 2050 continue; 2051 2052 /* 2053 * Fetch the associated command from user space. 2054 */ 2055 command = fuword(&iocb->aio_lio_opcode); 2056 if (command == LIO_NOP) { 2057 found++; 2058 continue; 2059 } 2060 2061 jobref = fuword(&iocb->_aiocb_private.kernelinfo); 2062 2063 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { 2064 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 2065 == jobref) { 2066 if (cb->uaiocb.aio_lio_opcode 2067 == LIO_WRITE) { 2068 p->p_stats->p_ru.ru_oublock 2069 += 2070 cb->outputcharge; 2071 cb->outputcharge = 0; 2072 } else if (cb->uaiocb.aio_lio_opcode 2073 == LIO_READ) { 2074 p->p_stats->p_ru.ru_inblock 2075 += cb->inputcharge; 2076 cb->inputcharge = 0; 2077 } 2078 found++; 2079 break; 2080 } 2081 } 2082 2083 s = splbio(); 2084 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) { 2085 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 2086 == jobref) { 2087 found++; 2088 break; 2089 } 2090 } 2091 splx(s); 2092 } 2093 2094 /* 2095 * If all I/Os have been disposed of, then we can 2096 * return. 2097 */ 2098 if (found == nentqueued) 2099 return runningcode; 2100 2101 ki->kaio_flags |= KAIO_WAKEUP; 2102 error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0); 2103 2104 if (error == EINTR) 2105 return EINTR; 2106 else if (error == EWOULDBLOCK) 2107 return EAGAIN; 2108 } 2109 } 2110 2111 return runningcode; 2112 } 2113 2114 /* 2115 * This is a weird hack so that we can post a signal. It is safe to do so from 2116 * a timeout routine, but *not* from an interrupt routine. 
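 *
 * aio_physwakeup() runs at interrupt time via bp->b_iodone, so rather than
 * calling psignal() there it defers delivery with
 * timeout(process_signal, aiocbe, 0) and lets this routine post the signal
 * from timeout (softclock) context.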
2117 */ 2118 static void 2119 process_signal(void *aioj) 2120 { 2121 struct aiocblist *aiocbe = aioj; 2122 struct aio_liojob *lj = aiocbe->lio; 2123 struct aiocb *cb = &aiocbe->uaiocb; 2124 2125 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && 2126 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { 2127 PROC_LOCK(lj->lioj_ki->kaio_p); 2128 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); 2129 PROC_UNLOCK(lj->lioj_ki->kaio_p); 2130 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2131 } 2132 2133 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 2134 PROC_LOCK(aiocbe->userproc); 2135 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); 2136 PROC_UNLOCK(aiocbe->userproc); 2137 } 2138 } 2139 2140 /* 2141 * Interrupt handler for physio, performs the necessary process wakeups, and 2142 * signals. 2143 */ 2144 static void 2145 aio_physwakeup(struct buf *bp) 2146 { 2147 struct aiocblist *aiocbe; 2148 struct proc *p; 2149 struct kaioinfo *ki; 2150 struct aio_liojob *lj; 2151 2152 wakeup(bp); 2153 2154 aiocbe = (struct aiocblist *)bp->b_spc; 2155 if (aiocbe) { 2156 p = bp->b_caller1; 2157 2158 aiocbe->jobstate = JOBST_JOBBFINISHED; 2159 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; 2160 aiocbe->uaiocb._aiocb_private.error = 0; 2161 aiocbe->jobflags |= AIOCBLIST_DONE; 2162 2163 if (bp->b_ioflags & BIO_ERROR) 2164 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 2165 2166 lj = aiocbe->lio; 2167 if (lj) { 2168 lj->lioj_buffer_finished_count++; 2169 2170 /* 2171 * wakeup/signal if all of the interrupt jobs are done. 2172 */ 2173 if (lj->lioj_buffer_finished_count == 2174 lj->lioj_buffer_count) { 2175 /* 2176 * Post a signal if it is called for. 2177 */ 2178 if ((lj->lioj_flags & 2179 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == 2180 LIOJ_SIGNAL) { 2181 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2182 aiocbe->timeouthandle = 2183 timeout(process_signal, 2184 aiocbe, 0); 2185 } 2186 } 2187 } 2188 2189 ki = p->p_aioinfo; 2190 if (ki) { 2191 ki->kaio_buffer_finished_count++; 2192 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 2193 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 2194 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 2195 2196 KNOTE(&aiocbe->klist, 0); 2197 /* Do the wakeup. */ 2198 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { 2199 ki->kaio_flags &= ~KAIO_WAKEUP; 2200 wakeup(p); 2201 } 2202 } 2203 2204 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2205 aiocbe->timeouthandle = 2206 timeout(process_signal, aiocbe, 0); 2207 } 2208 } 2209 2210 int 2211 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) 2212 { 2213 struct proc *p = td->td_proc; 2214 struct timeval atv; 2215 struct timespec ts; 2216 struct aiocb **cbptr; 2217 struct kaioinfo *ki; 2218 struct aiocblist *cb = NULL; 2219 int error, s, timo; 2220 2221 suword(uap->aiocbp, (int)NULL); 2222 2223 timo = 0; 2224 if (uap->timeout) { 2225 /* Get timespec struct. 
*/ 2226 error = copyin(uap->timeout, &ts, sizeof(ts)); 2227 if (error) 2228 return error; 2229 2230 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) 2231 return (EINVAL); 2232 2233 TIMESPEC_TO_TIMEVAL(&atv, &ts); 2234 if (itimerfix(&atv)) 2235 return (EINVAL); 2236 timo = tvtohz(&atv); 2237 } 2238 2239 ki = p->p_aioinfo; 2240 if (ki == NULL) 2241 return EAGAIN; 2242 2243 cbptr = uap->aiocbp; 2244 2245 for (;;) { 2246 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { 2247 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); 2248 td->td_retval[0] = cb->uaiocb._aiocb_private.status; 2249 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 2250 p->p_stats->p_ru.ru_oublock += 2251 cb->outputcharge; 2252 cb->outputcharge = 0; 2253 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 2254 p->p_stats->p_ru.ru_inblock += cb->inputcharge; 2255 cb->inputcharge = 0; 2256 } 2257 aio_free_entry(cb); 2258 return cb->uaiocb._aiocb_private.error; 2259 } 2260 2261 s = splbio(); 2262 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { 2263 splx(s); 2264 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); 2265 td->td_retval[0] = cb->uaiocb._aiocb_private.status; 2266 aio_free_entry(cb); 2267 return cb->uaiocb._aiocb_private.error; 2268 } 2269 2270 ki->kaio_flags |= KAIO_WAKEUP; 2271 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo); 2272 splx(s); 2273 2274 if (error == ERESTART) 2275 return EINTR; 2276 else if (error < 0) 2277 return error; 2278 else if (error == EINTR) 2279 return EINTR; 2280 else if (error == EWOULDBLOCK) 2281 return EAGAIN; 2282 } 2283 } 2284 2285 static int 2286 filt_aioattach(struct knote *kn) 2287 { 2288 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2289 2290 /* 2291 * The aiocbe pointer must be validated before using it, so 2292 * registration is restricted to the kernel; the user cannot 2293 * set EV_FLAG1. 2294 */ 2295 if ((kn->kn_flags & EV_FLAG1) == 0) 2296 return (EPERM); 2297 kn->kn_flags &= ~EV_FLAG1; 2298 2299 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext); 2300 2301 return (0); 2302 } 2303 2304 static void 2305 filt_aiodetach(struct knote *kn) 2306 { 2307 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2308 2309 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext); 2310 } 2311 2312 /*ARGSUSED*/ 2313 static int 2314 filt_aio(struct knote *kn, long hint) 2315 { 2316 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2317 2318 kn->kn_data = aiocbe->uaiocb._aiocb_private.error; 2319 if (aiocbe->jobstate != JOBST_JOBFINISHED && 2320 aiocbe->jobstate != JOBST_JOBBFINISHED) 2321 return (0); 2322 kn->kn_flags |= EV_EOF; 2323 return (1); 2324 } 2325